From 897495fbe517c3977d9f717ea2687be6372b3d3d Mon Sep 17 00:00:00 2001 From: jeejeeli Date: Fri, 24 May 2024 23:49:57 +0800 Subject: [PATCH 01/71] kernel v0 done --- vllm/lora/ops/__init__.py | 0 vllm/lora/ops/sgmv_expand.py | 134 +++++++++++++++++++++++++++++++++++ vllm/lora/ops/sgmv_shrink.py | 134 +++++++++++++++++++++++++++++++++++ vllm/lora/punica.py | 3 +- 4 files changed, 270 insertions(+), 1 deletion(-) create mode 100644 vllm/lora/ops/__init__.py create mode 100644 vllm/lora/ops/sgmv_expand.py create mode 100644 vllm/lora/ops/sgmv_shrink.py diff --git a/vllm/lora/ops/__init__.py b/vllm/lora/ops/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/vllm/lora/ops/sgmv_expand.py b/vllm/lora/ops/sgmv_expand.py new file mode 100644 index 000000000000..b6bcca9fe8d2 --- /dev/null +++ b/vllm/lora/ops/sgmv_expand.py @@ -0,0 +1,134 @@ +import triton +import triton.language as tl +import torch + + +@triton.jit +def _sgmv_expand_kernel( + input_ptr, + lora_ptr, + out_ptr, + N, + K, + b_seq_start_loc, + seq_lens, + lora_indices, + xm_stride, + xk_stride, # 1 + l0_stride, # hidden_size*max_rank + lora_k_stride, + lora_n_stride, + cm_stride, + cn_stride, + BLOCK_M: tl.constexpr, + BLOCK_N: tl.constexpr, + BLOCK_K: tl.constexpr, + EVEN_K: tl.constexpr, +): + pid = tl.program_id(axis=0) + cta_n_num = tl.cdiv(N, BLOCK_N) + pid_m = pid // cta_n_num + pid_n = pid % cta_n_num + + cur_batch = tl.program_id(axis=1) + M = tl.load(seq_lens + cur_batch) + if pid_m * BLOCK_M > M: + return + cur_seq_start = tl.load(b_seq_start_loc + cur_batch) + offset_m = tl.arange(0, BLOCK_M) + pid_m * BLOCK_M + offset_n = tl.arange(0, BLOCK_N) + pid_n * BLOCK_N + offset_k = tl.arange(0, BLOCK_K) + ram = tl.max_contiguous(tl.multiple_of(offset_m % M, BLOCK_M), BLOCK_M) + rbn = tl.max_contiguous(tl.multiple_of(offset_n % N, BLOCK_N), BLOCK_N) + lora_index = tl.load(lora_indices + cur_batch) + + a_ptr = (input_ptr + cur_seq_start * xm_stride + ram[:, None] * xm_stride + + offset_k[None, :] * xk_stride, ) + b_ptr = (lora_ptr + l0_stride * lora_index + + offset_k[:, None] * lora_n_stride + rbn[None, :] * lora_k_stride) + accumulator = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32) + for k in range(tl.cdiv(K, BLOCK_K)): + if EVEN_K: + tiled_a = tl.load(a_ptr) + tiled_b = tl.load(b_ptr) + else: + tiled_a = tl.load(a_ptr, + mask=offset_k[None, :] < K - k * BLOCK_K, + other=0) + tiled_b = tl.load(b_ptr, + mask=offset_k[:, None] < K - k * BLOCK_K, + other=0) + accumulator += tl.dot( + tiled_a, + tiled_b, + ) + a_ptr += BLOCK_K * xk_stride + b_ptr += BLOCK_K * lora_n_stride + tiled_c = accumulator.to(input_ptr.dtype.element_ty) + offset_cm = cur_seq_start + tl.arange(0, BLOCK_M) + pid_m * BLOCK_M + offset_cn = tl.arange(0, BLOCK_N) + pid_n * BLOCK_N + c_ptr = (out_ptr + offset_cm[:, None] * cm_stride + + offset_cn[None, :] * cn_stride) + M = tl.load(seq_lens + cur_batch) + c_mask = (offset_cm[:, None] < + (cur_seq_start + M)) & (offset_cn[None, :] < N) + tl.store(c_ptr, tiled_c, mask=c_mask) + + +@torch.inference_mode() +def sgmv_expand( + inputs: torch.Tensor, + lora_b_weights: torch.Tensor, + output_tensor: torch.Tensor, + b_seq_start_loc: torch.Tensor, + seq_len_tensor: torch.Tensor, + lora_indices_tensor: torch.Tensor, + batchs: int, + max_seq_length: int, +): + """_summary_ + + Args: + inputs (torch.Tensor): _description_ + lora_b_weights (torch.Tensor): _description_ + output_tensor (torch.Tensor): _description_ + b_seq_start_loc (torch.Tensor): _description_ + seq_len_tensor (torch.Tensor): 
_description_ + lora_indices_tensor (torch.Tensor): _description_ + batchs (int): _description_ + max_seq_length (int): _description_ + """ + _, N, K = lora_b_weights.shape # K= rank,N=hidden_size + + BLOCK_M = 32 + BLOCK_N = 32 + BLOCK_K = 16 + EVEN_K = K % BLOCK_K == 0 + + grid = [ + triton.cdiv(max_seq_length, BLOCK_M) * triton.cdiv(N, BLOCK_N), + batchs, + ] + + _sgmv_expand_kernel[grid]( + inputs, + lora_b_weights, + output_tensor, + N, + K, + b_seq_start_loc, + seq_len_tensor, + lora_indices_tensor, + inputs.stride(0), + inputs.stride(1), + lora_b_weights.stride(0), + lora_b_weights.stride(1), + lora_b_weights.stride(2), + output_tensor.stride(0), + output_tensor.stride(1), + BLOCK_M, + BLOCK_N, + BLOCK_K, + EVEN_K, + ) + return diff --git a/vllm/lora/ops/sgmv_shrink.py b/vllm/lora/ops/sgmv_shrink.py new file mode 100644 index 000000000000..595c93b89c54 --- /dev/null +++ b/vllm/lora/ops/sgmv_shrink.py @@ -0,0 +1,134 @@ +import triton +import triton.language as tl +import torch + + +@triton.jit +def _sgmv_shrink_kernel( + input_ptr, + lora_ptr, + out_ptr, + N, + K, + b_seq_start_loc, + seq_lens, + lora_indices, + xm_stride, # hidden_size + xk_stride, # 1 + l0_stride, # hidden_size*max_rank + lora_k_stride, + lora_n_stride, + cm_stride, + cn_stride, + BLOCK_M: tl.constexpr, + BLOCK_N: tl.constexpr, + BLOCK_K: tl.constexpr, + EVEN_K: tl.constexpr, + SPLIT_K: tl.constexpr, +): + pid = tl.program_id(axis=0) + pid_sk = tl.program_id(axis=1) + cur_batch = tl.program_id(axis=2) + cta_n_num = tl.cdiv(N, BLOCK_N) + pid_m = pid // cta_n_num + pid_n = pid % cta_n_num + + M = tl.load(seq_lens + cur_batch) + if pid_m * BLOCK_M > M: + return + cur_seq_start = tl.load(b_seq_start_loc + cur_batch) + offset_m = tl.arange(0, BLOCK_M) + pid_m * BLOCK_M + offset_n = tl.arange(0, BLOCK_N) + pid_n * BLOCK_N + offset_k = pid_sk * BLOCK_K + tl.arange(0, BLOCK_K) + + ram = tl.max_contiguous(tl.multiple_of(offset_m % M, BLOCK_M), BLOCK_M) + rbn = tl.max_contiguous(tl.multiple_of(offset_n % N, BLOCK_N), BLOCK_N) + lora_index = tl.load(lora_indices + cur_batch) + a_ptr = (input_ptr + cur_seq_start * xm_stride + ram[:, None] * xm_stride + + offset_k[None, :] * xk_stride) + b_ptr = (lora_ptr + l0_stride * lora_index + rbn[None, :] * lora_k_stride + + offset_k[:, None] * lora_n_stride) + + accumulator = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32) + for k in range(0, tl.cdiv(K, BLOCK_K * SPLIT_K)): + if EVEN_K: + a = tl.load(a_ptr) + b = tl.load(b_ptr) + else: + k_remaining = K - k * (BLOCK_K * SPLIT_K) + a = tl.load(a_ptr, mask=offset_k[None, :] < k_remaining, other=0.0) + b = tl.load(b_ptr, mask=offset_k[:, None] < k_remaining, other=0.0) + accumulator += tl.dot(a, b) + a_ptr += BLOCK_K * SPLIT_K * xk_stride + b_ptr += BLOCK_K * SPLIT_K * lora_n_stride + offset_cm = cur_seq_start + tl.arange(0, BLOCK_M) + pid_m * BLOCK_M + offset_cn = tl.arange(0, BLOCK_N) + pid_n * BLOCK_N + c_ptr = (out_ptr + offset_cm[:, None] * cm_stride + + offset_cn[None, :] * cn_stride) + + c_mask = (offset_cm[:, None] < + (cur_seq_start + M)) & (offset_cn[None, :] < N) + if SPLIT_K == 1: + tl.store(c_ptr, accumulator, mask=c_mask) + else: + tl.atomic_add(c_ptr, accumulator, mask=c_mask) + + +@torch.inference_mode() +def sgmv_shrink( + inputs: torch.Tensor, + lora_a_weights: torch.Tensor, + output_tensor: torch.Tensor, + b_seq_start_loc: torch.Tensor, + seq_len_tensor: torch.Tensor, + lora_indices_tensor: torch.Tensor, + batchs: int, + max_seq_length: int, +): + """_summary_ + + Args: + inputs (torch.Tensor): _description_ + 
lora_a_weights (torch.Tensor): _description_ + output_tensor (torch.Tensor): _description_ + b_seq_start_loc (torch.Tensor): _description_ + seq_len_tensor (torch.Tensor): _description_ + lora_indices_tensor (torch.Tensor): _description_ + batchs (int): _description_ + max_seq_length (int): _description_ + """ + _, N, K = lora_a_weights.shape # K=hidden_size,N=rank + BLOCK_M = 32 + BLOCK_N = 32 + BLOCK_K = 32 + SPLIT_K = 8 + EVEN_K = K % (SPLIT_K * BLOCK_K) == 0 + + grid = [ + triton.cdiv(max_seq_length, BLOCK_M) * triton.cdiv(N, BLOCK_N), + SPLIT_K, + batchs, + ] + _sgmv_shrink_kernel[grid]( + inputs, + lora_a_weights, + output_tensor, + N, + K, + b_seq_start_loc, + seq_len_tensor, + lora_indices_tensor, + inputs.stride(0), + inputs.stride(1), + lora_a_weights.stride(0), + lora_a_weights.stride(1), + lora_a_weights.stride(2), + output_tensor.stride(0), + output_tensor.stride(1), + BLOCK_M, + BLOCK_N, + BLOCK_K, + EVEN_K, + SPLIT_K, + ) + return diff --git a/vllm/lora/punica.py b/vllm/lora/punica.py index c87bed54726f..8957b6168304 100644 --- a/vllm/lora/punica.py +++ b/vllm/lora/punica.py @@ -1,8 +1,9 @@ # Based on code from https://github.com/punica-ai/punica from typing import Optional - import torch +from vllm.lora.ops.sgmv_expand import sgmv_expand +from vllm.lora.ops.sgmv_shrink import sgmv_shrink def _raise_import_error(e): From e50234ee32fa89ac41240d1d0dc5255d7dd78482 Mon Sep 17 00:00:00 2001 From: jeejeeli Date: Sun, 26 May 2024 00:26:32 +0800 Subject: [PATCH 02/71] add temp_test.py --- vllm/lora/ops/temp_test.py | 141 +++++++++++++++++++++++++++++++++++++ 1 file changed, 141 insertions(+) create mode 100644 vllm/lora/ops/temp_test.py diff --git a/vllm/lora/ops/temp_test.py b/vllm/lora/ops/temp_test.py new file mode 100644 index 000000000000..79464266883b --- /dev/null +++ b/vllm/lora/ops/temp_test.py @@ -0,0 +1,141 @@ +import torch + +import pytest +from vllm.lora.ops.sgmv_expand import sgmv_expand +from vllm.lora.ops.sgmv_shrink import sgmv_shrink + + + + +def ref_torch_groupgemm( + x_ptr, + lora_ptr, + batchs, + lora_indices_tensor, + seq_len_tensor, +) -> torch.Tensor: + out_list = [] + + current_offset = 0 + for lora_index, b_length in zip(range(batchs), seq_len_tensor): + input_weight = x_ptr[current_offset : b_length + current_offset, :] + current_offset += b_length + lora_weight = lora_ptr[lora_indices_tensor[lora_index]] + result = torch.nn.functional.linear(input_weight, lora_weight) + out_list.append(result) + out = torch.cat(out_list, dim=0) + return out + + +@pytest.mark.parametrize("batchs", [i for i in range(0, 128, 8)]) +@pytest.mark.parametrize("hidden_size", [128, 256, 512, 1024, 4096, 8192, 3424]) +@pytest.mark.parametrize("lora_nums", [4, 8, 16, 32, 64, 128]) +@pytest.mark.parametrize("max_rank", [1, 8, 16, 32, 64, 128]) +@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16,torch.float32]) +@torch.inference_mode() +def test_shrink_kernel(batchs, hidden_size, lora_nums, max_rank, dtype): + SEED = [0xABCDABCD987] + torch.manual_seed(SEED[0]) + if batchs == 0: + batchs += 1 + + seq_len_tensor = torch.randint(1, 1024, (batchs,)).cuda() + b_seq_start_loc = torch.cumsum( + torch.tensor([0] + seq_len_tensor[:-1].tolist(), dtype=torch.long), + dim=0, + ).cuda() + total_tokens = seq_len_tensor.sum() + + inputs_tensor = torch.rand((total_tokens, hidden_size), dtype=dtype).cuda() + lora_a_weights = torch.rand( + (lora_nums, max_rank, hidden_size), # col-major + dtype=dtype, + ).cuda() + + lora_indices_tensor = torch.randint(0, lora_nums - 1, 
(batchs,)).cuda() + output_tensor = torch.zeros( + total_tokens, max_rank, dtype=torch.float32 + ).cuda() + + max_seq_length = seq_len_tensor.max() + if isinstance(max_seq_length, tuple): + max_seq_length = max_seq_length[0].item() + else: + max_seq_length = max_seq_length.item() + + sgmv_shrink( + inputs_tensor, + lora_a_weights, + output_tensor, + b_seq_start_loc, + seq_len_tensor, + lora_indices_tensor, + batchs, + max_seq_length, + ) + torch.cuda.synchronize() + torch_out_tensor = ref_torch_groupgemm( + inputs_tensor, + lora_a_weights, + batchs, + lora_indices_tensor, + seq_len_tensor, + ) + torch_out_tensor = torch_out_tensor.to(torch.float32) + assert torch.allclose(torch_out_tensor, output_tensor, atol=1e-2, rtol=1e-2) + +@pytest.mark.parametrize("batchs", [i for i in range(0, 128, 8)]) +@pytest.mark.parametrize("hidden_size", [128, 256, 512, 1024, 4096, 8192, 3424]) +@pytest.mark.parametrize("lora_nums", [4, 8, 16, 32, 64, 128]) +@pytest.mark.parametrize("max_rank", [1, 8, 16, 32, 64, 128]) +@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16,torch.float32]) +@torch.inference_mode() +def test_expand_kernel(batchs, hidden_size, lora_nums, max_rank, dtype): + SEED = [0xABCDABCD987] + torch.manual_seed(SEED[0]) + if batchs == 0: + batchs += 1 + + seq_len_tensor = torch.randint(1, 1024, (batchs,)).cuda() + b_seq_start_loc = torch.cumsum( + torch.tensor([0] + seq_len_tensor[:-1].tolist(), dtype=torch.long), + dim=0, + ).cuda() + total_tokens = seq_len_tensor.sum() + + inputs_tensor = torch.rand((total_tokens, max_rank), dtype=dtype).cuda() + lora_b_weights = torch.rand( + (lora_nums,hidden_size, max_rank), # col-major + dtype=dtype, + ).cuda() + + lora_indices_tensor = torch.randint(0, lora_nums - 1, (batchs,)).cuda() + output_tensor = torch.zeros( + total_tokens, hidden_size, dtype=dtype + ).cuda() + + max_seq_length = seq_len_tensor.max() + if isinstance(max_seq_length, tuple): + max_seq_length = max_seq_length[0].item() + else: + max_seq_length = max_seq_length.item() + + sgmv_expand( + inputs_tensor, + lora_b_weights, + output_tensor, + b_seq_start_loc, + seq_len_tensor, + lora_indices_tensor, + batchs, + max_seq_length, + ) + torch.cuda.synchronize() + torch_out_tensor = ref_torch_groupgemm( + inputs_tensor, + lora_b_weights, + batchs, + lora_indices_tensor, + seq_len_tensor, + ) + assert torch.allclose(torch_out_tensor, output_tensor, atol=1e-2, rtol=1e-2) From cdfa7c6ebda708a59a6aa7f30af8cd842f77cab9 Mon Sep 17 00:00:00 2001 From: jeejeeli Date: Mon, 27 May 2024 09:58:16 +0800 Subject: [PATCH 03/71] add unit test --- tests/lora/test_triton_sgmv.py | 326 +++++++++++++++++++++++++++++++++ vllm/lora/ops/sgmv_expand.py | 45 +++-- vllm/lora/ops/sgmv_shrink.py | 52 ++++-- vllm/lora/ops/temp_test.py | 141 -------------- vllm/lora/punica.py | 6 +- 5 files changed, 402 insertions(+), 168 deletions(-) create mode 100644 tests/lora/test_triton_sgmv.py delete mode 100644 vllm/lora/ops/temp_test.py diff --git a/tests/lora/test_triton_sgmv.py b/tests/lora/test_triton_sgmv.py new file mode 100644 index 000000000000..5cbd40f210fb --- /dev/null +++ b/tests/lora/test_triton_sgmv.py @@ -0,0 +1,326 @@ +import random + +import pytest +import torch + +import vllm.lora.punica as punica +from vllm.lora.ops.sgmv_expand import sgmv_expand +from vllm.lora.ops.sgmv_shrink import sgmv_shrink + +#The current punica kernel supports dimension and adds a dimension of 3424. 
+HIDDEN_SIZES = [ + 128, + 256, + 512, + 1024, + 1152, + 1280, + 1536, + 2048, + 2304, + 2560, + 2752, + 3072, + 3424, + 3456, + 3584, + 4096, + 4608, + 5120, + 5504, + 5632, + 6144, + 6848, + 6912, + 7168, + 8192, + 9216, + 10240, + 11008, + 13824, + 14336, + 15360, + 22016, + 24576, + 27392, + 27648, + 32000, + 32256, + 32512, + 32768, + 33024, + 36864, + 43264, + 49152, + 64000, + 64256, + 102400, + 102656, + 128000, + 128256, +] +BATCHS = [i for i in range(0, 64, 8)] +NUM_LORA = [1, 4, 8, 16, 32, 64, 128, 256] +DTYPES = [torch.half, torch.bfloat16, torch.float32] +MAX_RANKS = [1, 4, 8, 16, 32, 64, 128] +SCALES = [0.5] +OP_TYPES = ["shrink", "expand"] +SEED = [0] +CUDA_DEVICES = [f"cuda:{0}"] + + +def assert_close(a, b): + rtol, atol = { + torch.float16: (1e-2, 1e-2), + torch.bfloat16: (12e-2, 1e-2), + torch.float32: (1e-2, 1e-2), + }[a.dtype] + torch.testing.assert_close(a, b, rtol=rtol, atol=atol) + + +@torch.inference_mode() +def _punica_bgmv(out_tensor, inputs, lora_weights, indices, scaling): + layer_idx = 0 + punica.bgmv(out_tensor, inputs, lora_weights, indices, layer_idx, scaling) + return + + +def _torch_groupgemm( + out_tensor, + inputs, + lora_weights, + lora_indices_tensor, + seq_len_tensor, + batchs, + scaling, +) -> torch.Tensor: + out_list = [] + current_offset = 0 + for lora_index, b_length in zip(range(batchs), seq_len_tensor): + input_weight = inputs[current_offset:b_length + current_offset, :] + current_offset += b_length + lora_weight = lora_weights[lora_indices_tensor[lora_index]] + result = torch.nn.functional.linear(input_weight, lora_weight) + result *= scaling + out_list.append(result) + out_tensor.copy_(torch.cat(out_list, dim=0)) + return + + +def _generate_data(batchs, hidden_size, lora_nums, max_rank, max_length, dtype, + op_type, device): + seq_len_tensor = torch.randint(1, max_length, (batchs, )).to(device) + b_seq_start_loc = torch.cumsum( + torch.tensor([0] + seq_len_tensor[:-1].tolist(), dtype=torch.long), + dim=0, + ).to(device) + total_tokens = seq_len_tensor.sum() + if op_type == "shrink": + inputs_tensor = torch.rand((total_tokens, hidden_size), + dtype=dtype).to(device) + lora_weights = torch.rand( + (lora_nums, max_rank, hidden_size), # col-major + dtype=dtype, + ).to(device) + ref_out_tensor = torch.zeros((total_tokens, max_rank), + dtype=dtype, + device=inputs_tensor.device) + # NOTE shrink kernel using torch.float32 as output type + our_out_tensor = torch.zeros( + (total_tokens, max_rank), + dtype=torch.float32, + device=inputs_tensor.device, + ) + else: + inputs_tensor = torch.rand( + (total_tokens, max_rank), + dtype=dtype, + ).to(device) + lora_weights = torch.rand( + (lora_nums, hidden_size, max_rank), # col-major + dtype=dtype, + ).to(device) + ref_out_tensor = torch.zeros( + (total_tokens, hidden_size), + dtype=dtype, + device=inputs_tensor.device, + ) + our_out_tensor = torch.zeros( + (total_tokens, hidden_size), + dtype=dtype, + device=inputs_tensor.device, + ) + + lora_indices_tensor = torch.randint(0, + lora_nums - 1 if lora_nums > 1 else 1, + (batchs, )).to(device) + indices = torch.zeros((total_tokens), dtype=torch.long).to(device) + current_offset = 0 + for b_id in range(batchs): + lora_index = lora_indices_tensor[b_id] + indices[current_offset:current_offset + + seq_len_tensor[b_id]] = lora_index.item() + current_offset += seq_len_tensor[b_id].item() + return ( + inputs_tensor, + lora_weights, + our_out_tensor, + ref_out_tensor, + b_seq_start_loc, + lora_indices_tensor, + seq_len_tensor, + indices, + ) + + 
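+# The tests below check the Triton SGMV kernels (sgmv_shrink / sgmv_expand)
+# against the plain PyTorch group-GEMM loop defined above (_torch_groupgemm)
+# and against the existing punica bgmv kernel, across batch sizes, LoRA
+# counts, ranks and dtypes.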
+@pytest.mark.parametrize("batchs", BATCHS) +@pytest.mark.parametrize("num_loras", NUM_LORA) +@pytest.mark.parametrize("rank", MAX_RANKS) +@pytest.mark.parametrize("scaling", SCALES) +@pytest.mark.parametrize("dtype", DTYPES) +@pytest.mark.parametrize("op_type", OP_TYPES) +@pytest.mark.parametrize("seed", SEED) +@pytest.mark.parametrize("device", CUDA_DEVICES) +def test_triton_sgmv( + batchs: int, + num_loras: int, + rank: int, + scaling: float, + dtype: torch.dtype, + op_type: str, + seed: int, + device: str, +): + torch.manual_seed(seed) + if batchs == 0: + batchs += 1 + hidden_size_index = random.randint(0, len(HIDDEN_SIZES) - 1) + hidden_size = HIDDEN_SIZES[hidden_size_index] + if hidden_size > 100000: + hidden_size = hidden_size // 4 # avoid OOM + ( + inputs_tensor, + lora_weights, + our_out_tensor, + ref_out_tensor, + b_seq_start_loc, + lora_indices_tensor, + seq_len_tensor, + indices, + ) = _generate_data( + batchs, hidden_size, num_loras, rank, 1024, dtype, op_type, + device) # The sequence length is restricted to the range [1, 1024]. + max_seq_length = seq_len_tensor.max() + if isinstance(max_seq_length, tuple): + max_seq_length = max_seq_length[0].item() + else: + max_seq_length = max_seq_length.item() + if op_type == "shrink": + sgmv_shrink( + inputs_tensor, + lora_weights, + our_out_tensor, + b_seq_start_loc, + seq_len_tensor, + lora_indices_tensor, + batchs, + max_seq_length, + scaling, + ) + else: + sgmv_expand( + inputs_tensor, + lora_weights, + our_out_tensor, + b_seq_start_loc, + seq_len_tensor, + lora_indices_tensor, + batchs, + max_seq_length, + ) + _torch_groupgemm( + ref_out_tensor, + inputs_tensor, + lora_weights, + lora_indices_tensor, + seq_len_tensor, + batchs, + scaling if op_type == "shrink" else 1.0, + ) + if op_type == "shrink": + ref_out_tensor = ref_out_tensor.to(torch.float32) + assert_close(our_out_tensor, ref_out_tensor) + + +@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES) +@pytest.mark.parametrize("scaling", SCALES) +@pytest.mark.parametrize("dtype", DTYPES) +@pytest.mark.parametrize("op_type", OP_TYPES) +@pytest.mark.parametrize("seed", SEED) +@pytest.mark.parametrize("device", CUDA_DEVICES) +def test_sgmv_punica_bgmv( + hidden_size, + scaling: float, + dtype: torch.dtype, + op_type: str, + seed: int, + device: str, +): + # avoid `No suitable kernel. 
h_in=xx h_out=xxxx ` error + if dtype == torch.float32 or hidden_size == 3424: + return + torch.manual_seed(seed) + batchs = 4 # Arbitrary values for testing + rank = 16 + seq_len = 333 # Arbitrary values for testing + num_loras = 8 # Arbitrary values for testing + ( + inputs_tensor, + lora_weights, + our_out_tensor, + ref_out_tensor, + b_seq_start_loc, + lora_indices_tensor, + seq_len_tensor, + indices, + ) = _generate_data(batchs, hidden_size, num_loras, rank, seq_len, dtype, + op_type, device) + max_seq_length = seq_len_tensor.max() + if isinstance(max_seq_length, tuple): + max_seq_length = max_seq_length[0].item() + else: + max_seq_length = max_seq_length.item() + if op_type == "shrink": + sgmv_shrink( + inputs_tensor, + lora_weights, + our_out_tensor, + b_seq_start_loc, + seq_len_tensor, + lora_indices_tensor, + batchs, + max_seq_length, + scaling, + ) + else: + sgmv_expand( + inputs_tensor, + lora_weights, + our_out_tensor, + b_seq_start_loc, + seq_len_tensor, + lora_indices_tensor, + batchs, + max_seq_length, + ) + lora_weights_4d = lora_weights.unsqueeze(dim=1) + _punica_bgmv( + ref_out_tensor, + inputs_tensor, + lora_weights_4d, + indices, + scaling if op_type == "shrink" else 1.0, + ) + if op_type == "shrink": + ref_out_tensor = ref_out_tensor.to(torch.float32) + assert_close(our_out_tensor, ref_out_tensor) diff --git a/vllm/lora/ops/sgmv_expand.py b/vllm/lora/ops/sgmv_expand.py index b6bcca9fe8d2..354778926250 100644 --- a/vllm/lora/ops/sgmv_expand.py +++ b/vllm/lora/ops/sgmv_expand.py @@ -1,6 +1,6 @@ +import torch import triton import triton.language as tl -import torch @triton.jit @@ -25,22 +25,27 @@ def _sgmv_expand_kernel( BLOCK_K: tl.constexpr, EVEN_K: tl.constexpr, ): + """ + The sgmv's expand triton kernel is based on GroupGEMM. + The GEMM of Multi-LoRA can be considered as GroupGEMM. + """ pid = tl.program_id(axis=0) + cur_batch = tl.program_id(axis=1) cta_n_num = tl.cdiv(N, BLOCK_N) pid_m = pid // cta_n_num pid_n = pid % cta_n_num - - cur_batch = tl.program_id(axis=1) M = tl.load(seq_lens + cur_batch) if pid_m * BLOCK_M > M: return + lora_index = tl.load(lora_indices + cur_batch) + if lora_index == -1: + return cur_seq_start = tl.load(b_seq_start_loc + cur_batch) offset_m = tl.arange(0, BLOCK_M) + pid_m * BLOCK_M offset_n = tl.arange(0, BLOCK_N) + pid_n * BLOCK_N offset_k = tl.arange(0, BLOCK_K) ram = tl.max_contiguous(tl.multiple_of(offset_m % M, BLOCK_M), BLOCK_M) rbn = tl.max_contiguous(tl.multiple_of(offset_n % N, BLOCK_N), BLOCK_N) - lora_index = tl.load(lora_indices + cur_batch) a_ptr = (input_ptr + cur_seq_start * xm_stride + ram[:, None] * xm_stride + offset_k[None, :] * xk_stride, ) @@ -89,17 +94,31 @@ def sgmv_expand( """_summary_ Args: - inputs (torch.Tensor): _description_ - lora_b_weights (torch.Tensor): _description_ - output_tensor (torch.Tensor): _description_ - b_seq_start_loc (torch.Tensor): _description_ - seq_len_tensor (torch.Tensor): _description_ - lora_indices_tensor (torch.Tensor): _description_ - batchs (int): _description_ - max_seq_length (int): _description_ + inputs (torch.Tensor): input tensor + lora_b_weights (torch.Tensor): lora'a weight + output_tensor (torch.Tensor): output tensor + b_seq_start_loc (torch.Tensor): (batch_size,). The cumulative + sequence lengths of the sequences in the batch, used to index + into sequence. E.g.,if the sequence length is [4, 6], it is + [0, 4, 10]. + seq_len_tensor (torch.Tensor): (batch_size,). 
record the sequence + length of the sequences in the batch + lora_indices_tensor (torch.Tensor): (batch_size,). The LoRA index + corresponding to each batch + batchs (int): batch size + max_seq_length (int): The max sequence lengths of the sequences + in the batch """ + assert inputs.dtype == lora_b_weights.dtype + assert inputs.dtype in [torch.float16, torch.bfloat16, torch.float32] + assert inputs.size(1) == lora_b_weights.size(-1) + assert b_seq_start_loc.size(0) == batchs + assert lora_indices_tensor.size(0) == batchs + assert inputs.is_contiguous() + assert lora_b_weights.is_contiguous() + assert output_tensor.is_contiguous() + # TODO tuning this config _, N, K = lora_b_weights.shape # K= rank,N=hidden_size - BLOCK_M = 32 BLOCK_N = 32 BLOCK_K = 16 diff --git a/vllm/lora/ops/sgmv_shrink.py b/vllm/lora/ops/sgmv_shrink.py index 595c93b89c54..d3858d91791e 100644 --- a/vllm/lora/ops/sgmv_shrink.py +++ b/vllm/lora/ops/sgmv_shrink.py @@ -1,6 +1,6 @@ +import torch import triton import triton.language as tl -import torch @triton.jit @@ -13,6 +13,7 @@ def _sgmv_shrink_kernel( b_seq_start_loc, seq_lens, lora_indices, + scaling, xm_stride, # hidden_size xk_stride, # 1 l0_stride, # hidden_size*max_rank @@ -26,6 +27,11 @@ def _sgmv_shrink_kernel( EVEN_K: tl.constexpr, SPLIT_K: tl.constexpr, ): + """ + The sgmv's shrink triton kernel is based on GroupGEMM+SPLIT-K. + The GEMM of Multi-LoRA can be considered as GroupGEMM. Additionally, + introducing SPLIT-K can improve performance + """ pid = tl.program_id(axis=0) pid_sk = tl.program_id(axis=1) cur_batch = tl.program_id(axis=2) @@ -36,6 +42,9 @@ def _sgmv_shrink_kernel( M = tl.load(seq_lens + cur_batch) if pid_m * BLOCK_M > M: return + lora_index = tl.load(lora_indices + cur_batch) + if lora_index == -1: + return cur_seq_start = tl.load(b_seq_start_loc + cur_batch) offset_m = tl.arange(0, BLOCK_M) + pid_m * BLOCK_M offset_n = tl.arange(0, BLOCK_N) + pid_n * BLOCK_N @@ -43,12 +52,11 @@ def _sgmv_shrink_kernel( ram = tl.max_contiguous(tl.multiple_of(offset_m % M, BLOCK_M), BLOCK_M) rbn = tl.max_contiguous(tl.multiple_of(offset_n % N, BLOCK_N), BLOCK_N) - lora_index = tl.load(lora_indices + cur_batch) + a_ptr = (input_ptr + cur_seq_start * xm_stride + ram[:, None] * xm_stride + offset_k[None, :] * xk_stride) b_ptr = (lora_ptr + l0_stride * lora_index + rbn[None, :] * lora_k_stride + offset_k[:, None] * lora_n_stride) - accumulator = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32) for k in range(0, tl.cdiv(K, BLOCK_K * SPLIT_K)): if EVEN_K: @@ -68,6 +76,8 @@ def _sgmv_shrink_kernel( c_mask = (offset_cm[:, None] < (cur_seq_start + M)) & (offset_cn[None, :] < N) + accumulator *= scaling + # handles write-back with reduction-splitting if SPLIT_K == 1: tl.store(c_ptr, accumulator, mask=c_mask) else: @@ -84,19 +94,36 @@ def sgmv_shrink( lora_indices_tensor: torch.Tensor, batchs: int, max_seq_length: int, + scaling: float, ): - """_summary_ + """ Args: - inputs (torch.Tensor): _description_ - lora_a_weights (torch.Tensor): _description_ - output_tensor (torch.Tensor): _description_ - b_seq_start_loc (torch.Tensor): _description_ - seq_len_tensor (torch.Tensor): _description_ - lora_indices_tensor (torch.Tensor): _description_ - batchs (int): _description_ - max_seq_length (int): _description_ + inputs (torch.Tensor): input tensor + lora_a_weights (torch.Tensor): lora'a weight + output_tensor (torch.Tensor): output tensor + b_seq_start_loc (torch.Tensor): (batch_size,). 
The cumulative + sequence lengths of the sequences in the batch, used to index + into sequence. E.g.,if the sequence length is [4, 6], it is + [0, 4, 10]. + seq_len_tensor (torch.Tensor): (batch_size,). record the sequence + length of the sequences in the batch + lora_indices_tensor (torch.Tensor): (batch_size,). The LoRA index + corresponding to each batch + batchs (int): batch size + max_seq_length (int): The max sequence lengths of the sequences + in the batch + scaling (float): Scaling factor. """ + assert inputs.dtype == lora_a_weights.dtype + assert inputs.dtype in [torch.float16, torch.bfloat16, torch.float32] + assert inputs.size(1) == lora_a_weights.size(-1) + assert b_seq_start_loc.size(0) == batchs + assert lora_indices_tensor.size(0) == batchs + assert inputs.is_contiguous() + assert lora_a_weights.is_contiguous() + assert output_tensor.is_contiguous() + # TODO tuning this config _, N, K = lora_a_weights.shape # K=hidden_size,N=rank BLOCK_M = 32 BLOCK_N = 32 @@ -118,6 +145,7 @@ def sgmv_shrink( b_seq_start_loc, seq_len_tensor, lora_indices_tensor, + scaling, inputs.stride(0), inputs.stride(1), lora_a_weights.stride(0), diff --git a/vllm/lora/ops/temp_test.py b/vllm/lora/ops/temp_test.py deleted file mode 100644 index 79464266883b..000000000000 --- a/vllm/lora/ops/temp_test.py +++ /dev/null @@ -1,141 +0,0 @@ -import torch - -import pytest -from vllm.lora.ops.sgmv_expand import sgmv_expand -from vllm.lora.ops.sgmv_shrink import sgmv_shrink - - - - -def ref_torch_groupgemm( - x_ptr, - lora_ptr, - batchs, - lora_indices_tensor, - seq_len_tensor, -) -> torch.Tensor: - out_list = [] - - current_offset = 0 - for lora_index, b_length in zip(range(batchs), seq_len_tensor): - input_weight = x_ptr[current_offset : b_length + current_offset, :] - current_offset += b_length - lora_weight = lora_ptr[lora_indices_tensor[lora_index]] - result = torch.nn.functional.linear(input_weight, lora_weight) - out_list.append(result) - out = torch.cat(out_list, dim=0) - return out - - -@pytest.mark.parametrize("batchs", [i for i in range(0, 128, 8)]) -@pytest.mark.parametrize("hidden_size", [128, 256, 512, 1024, 4096, 8192, 3424]) -@pytest.mark.parametrize("lora_nums", [4, 8, 16, 32, 64, 128]) -@pytest.mark.parametrize("max_rank", [1, 8, 16, 32, 64, 128]) -@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16,torch.float32]) -@torch.inference_mode() -def test_shrink_kernel(batchs, hidden_size, lora_nums, max_rank, dtype): - SEED = [0xABCDABCD987] - torch.manual_seed(SEED[0]) - if batchs == 0: - batchs += 1 - - seq_len_tensor = torch.randint(1, 1024, (batchs,)).cuda() - b_seq_start_loc = torch.cumsum( - torch.tensor([0] + seq_len_tensor[:-1].tolist(), dtype=torch.long), - dim=0, - ).cuda() - total_tokens = seq_len_tensor.sum() - - inputs_tensor = torch.rand((total_tokens, hidden_size), dtype=dtype).cuda() - lora_a_weights = torch.rand( - (lora_nums, max_rank, hidden_size), # col-major - dtype=dtype, - ).cuda() - - lora_indices_tensor = torch.randint(0, lora_nums - 1, (batchs,)).cuda() - output_tensor = torch.zeros( - total_tokens, max_rank, dtype=torch.float32 - ).cuda() - - max_seq_length = seq_len_tensor.max() - if isinstance(max_seq_length, tuple): - max_seq_length = max_seq_length[0].item() - else: - max_seq_length = max_seq_length.item() - - sgmv_shrink( - inputs_tensor, - lora_a_weights, - output_tensor, - b_seq_start_loc, - seq_len_tensor, - lora_indices_tensor, - batchs, - max_seq_length, - ) - torch.cuda.synchronize() - torch_out_tensor = ref_torch_groupgemm( - inputs_tensor, - 
lora_a_weights, - batchs, - lora_indices_tensor, - seq_len_tensor, - ) - torch_out_tensor = torch_out_tensor.to(torch.float32) - assert torch.allclose(torch_out_tensor, output_tensor, atol=1e-2, rtol=1e-2) - -@pytest.mark.parametrize("batchs", [i for i in range(0, 128, 8)]) -@pytest.mark.parametrize("hidden_size", [128, 256, 512, 1024, 4096, 8192, 3424]) -@pytest.mark.parametrize("lora_nums", [4, 8, 16, 32, 64, 128]) -@pytest.mark.parametrize("max_rank", [1, 8, 16, 32, 64, 128]) -@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16,torch.float32]) -@torch.inference_mode() -def test_expand_kernel(batchs, hidden_size, lora_nums, max_rank, dtype): - SEED = [0xABCDABCD987] - torch.manual_seed(SEED[0]) - if batchs == 0: - batchs += 1 - - seq_len_tensor = torch.randint(1, 1024, (batchs,)).cuda() - b_seq_start_loc = torch.cumsum( - torch.tensor([0] + seq_len_tensor[:-1].tolist(), dtype=torch.long), - dim=0, - ).cuda() - total_tokens = seq_len_tensor.sum() - - inputs_tensor = torch.rand((total_tokens, max_rank), dtype=dtype).cuda() - lora_b_weights = torch.rand( - (lora_nums,hidden_size, max_rank), # col-major - dtype=dtype, - ).cuda() - - lora_indices_tensor = torch.randint(0, lora_nums - 1, (batchs,)).cuda() - output_tensor = torch.zeros( - total_tokens, hidden_size, dtype=dtype - ).cuda() - - max_seq_length = seq_len_tensor.max() - if isinstance(max_seq_length, tuple): - max_seq_length = max_seq_length[0].item() - else: - max_seq_length = max_seq_length.item() - - sgmv_expand( - inputs_tensor, - lora_b_weights, - output_tensor, - b_seq_start_loc, - seq_len_tensor, - lora_indices_tensor, - batchs, - max_seq_length, - ) - torch.cuda.synchronize() - torch_out_tensor = ref_torch_groupgemm( - inputs_tensor, - lora_b_weights, - batchs, - lora_indices_tensor, - seq_len_tensor, - ) - assert torch.allclose(torch_out_tensor, output_tensor, atol=1e-2, rtol=1e-2) diff --git a/vllm/lora/punica.py b/vllm/lora/punica.py index 8957b6168304..1e6cb83f719d 100644 --- a/vllm/lora/punica.py +++ b/vllm/lora/punica.py @@ -1,9 +1,11 @@ # Based on code from https://github.com/punica-ai/punica from typing import Optional + import torch -from vllm.lora.ops.sgmv_expand import sgmv_expand -from vllm.lora.ops.sgmv_shrink import sgmv_shrink + +# from vllm.lora.ops.sgmv_expand import sgmv_expand +# from vllm.lora.ops.sgmv_shrink import sgmv_shrink def _raise_import_error(e): From 2fbb2ca49f97caaeaa3fbeb819d0df3fc43ba749 Mon Sep 17 00:00:00 2001 From: jeejeeli Date: Tue, 28 May 2024 22:18:08 +0800 Subject: [PATCH 04/71] back up --- tests/lora/test_triton_sgmv.py | 61 +++--- vllm/lora/layers.py | 306 ++++++++++++++++++++++++++--- vllm/lora/models.py | 21 +- vllm/lora/ops/sgmv_expand.py | 46 ++++- vllm/lora/ops/sgmv_expand_slice.py | 207 +++++++++++++++++++ vllm/lora/ops/sgmv_shrink.py | 25 ++- vllm/lora/punica.py | 193 +++++++++++++++--- vllm/worker/model_runner.py | 37 +++- 8 files changed, 797 insertions(+), 99 deletions(-) create mode 100644 vllm/lora/ops/sgmv_expand_slice.py diff --git a/tests/lora/test_triton_sgmv.py b/tests/lora/test_triton_sgmv.py index 5cbd40f210fb..d0903f76cd37 100644 --- a/tests/lora/test_triton_sgmv.py +++ b/tests/lora/test_triton_sgmv.py @@ -7,7 +7,7 @@ from vllm.lora.ops.sgmv_expand import sgmv_expand from vllm.lora.ops.sgmv_shrink import sgmv_shrink -#The current punica kernel supports dimension and adds a dimension of 3424. +# The current punica kernel supports dimension and adds a dimension of 3424. 
HIDDEN_SIZES = [ 128, 256, @@ -93,6 +93,7 @@ def _torch_groupgemm( seq_len_tensor, batchs, scaling, + op_type, ) -> torch.Tensor: out_list = [] current_offset = 0 @@ -103,7 +104,11 @@ def _torch_groupgemm( result = torch.nn.functional.linear(input_weight, lora_weight) result *= scaling out_list.append(result) - out_tensor.copy_(torch.cat(out_list, dim=0)) + cat_result = torch.cat(out_list, dim=0) + if op_type == "expand": + out_tensor += cat_result + else: + out_tensor.copy_(cat_result) return @@ -122,6 +127,7 @@ def _generate_data(batchs, hidden_size, lora_nums, max_rank, max_length, dtype, (lora_nums, max_rank, hidden_size), # col-major dtype=dtype, ).to(device) + # shrink op need atomic_add, so output is initinized by 0 ref_out_tensor = torch.zeros((total_tokens, max_rank), dtype=dtype, device=inputs_tensor.device) @@ -132,6 +138,7 @@ def _generate_data(batchs, hidden_size, lora_nums, max_rank, max_length, dtype, device=inputs_tensor.device, ) else: + inputs_tensor = torch.rand( (total_tokens, max_rank), dtype=dtype, @@ -140,16 +147,15 @@ def _generate_data(batchs, hidden_size, lora_nums, max_rank, max_length, dtype, (lora_nums, hidden_size, max_rank), # col-major dtype=dtype, ).to(device) - ref_out_tensor = torch.zeros( - (total_tokens, hidden_size), - dtype=dtype, - device=inputs_tensor.device, - ) - our_out_tensor = torch.zeros( + # expand op needs to complete y+=a@lora_b, so output is + # initinized randomly + ref_out_tensor = torch.rand( (total_tokens, hidden_size), dtype=dtype, device=inputs_tensor.device, ) + # Ensure the same input. + our_out_tensor = ref_out_tensor.clone() lora_indices_tensor = torch.randint(0, lora_nums - 1 if lora_nums > 1 else 1, @@ -181,7 +187,7 @@ def _generate_data(batchs, hidden_size, lora_nums, max_rank, max_length, dtype, @pytest.mark.parametrize("op_type", OP_TYPES) @pytest.mark.parametrize("seed", SEED) @pytest.mark.parametrize("device", CUDA_DEVICES) -def test_triton_sgmv( +def test_sgmv_torch( batchs: int, num_loras: int, rank: int, @@ -228,25 +234,18 @@ def test_triton_sgmv( scaling, ) else: - sgmv_expand( - inputs_tensor, - lora_weights, - our_out_tensor, - b_seq_start_loc, - seq_len_tensor, - lora_indices_tensor, - batchs, - max_seq_length, - ) - _torch_groupgemm( - ref_out_tensor, - inputs_tensor, - lora_weights, - lora_indices_tensor, - seq_len_tensor, - batchs, - scaling if op_type == "shrink" else 1.0, - ) + sgmv_expand(inputs_tensor, + lora_weights, + our_out_tensor, + b_seq_start_loc, + seq_len_tensor, + lora_indices_tensor, + batchs, + max_seq_length, + add_inputs=True) + _torch_groupgemm(ref_out_tensor, inputs_tensor, lora_weights, + lora_indices_tensor, seq_len_tensor, batchs, + scaling if op_type == "shrink" else 1.0, op_type) if op_type == "shrink": ref_out_tensor = ref_out_tensor.to(torch.float32) assert_close(our_out_tensor, ref_out_tensor) @@ -285,6 +284,7 @@ def test_sgmv_punica_bgmv( indices, ) = _generate_data(batchs, hidden_size, num_loras, rank, seq_len, dtype, op_type, device) + max_seq_length = seq_len_tensor.max() if isinstance(max_seq_length, tuple): max_seq_length = max_seq_length[0].item() @@ -312,6 +312,7 @@ def test_sgmv_punica_bgmv( lora_indices_tensor, batchs, max_seq_length, + add_inputs=True, ) lora_weights_4d = lora_weights.unsqueeze(dim=1) _punica_bgmv( @@ -324,3 +325,7 @@ def test_sgmv_punica_bgmv( if op_type == "shrink": ref_out_tensor = ref_out_tensor.to(torch.float32) assert_close(our_out_tensor, ref_out_tensor) + + +# if __name__ == "__main__": +# pytest.main(["test_triton_sgmv.py::test_sgmv_torch"]) diff 
--git a/vllm/lora/layers.py b/vllm/lora/layers.py index 24b74476c3b8..5e4f648f3788 100644 --- a/vllm/lora/layers.py +++ b/vllm/lora/layers.py @@ -1,6 +1,6 @@ # pylint: disable=unused-argument import math -from dataclasses import dataclass +from dataclasses import dataclass,field from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Union import torch @@ -16,7 +16,9 @@ tensor_model_parallel_all_reduce, tensor_model_parallel_gather) from vllm.distributed.utils import divide -from vllm.lora.punica import add_lora, add_lora_slice, bgmv +from vllm.lora.punica import (add_lora, add_lora_triton, add_lora_slice, + add_lora_triton_slice, bgmv) +from vllm.lora.ops.sgmv_expand import sgmv_expand from vllm.model_executor.layers.linear import (ColumnParallelLinear, MergedColumnParallelLinear, QKVParallelLinear, @@ -88,8 +90,47 @@ def _apply_lora( x = x.view(-1, x.shape[-1]) output = output.view(-1, output.shape[-1]) indices = indices.view(-1) - add_lora(output, x, lora_a_stacked, lora_b_stacked, indices, 0, 1.0) - return output.view_as(org_output) + buffer = add_lora(output, x, lora_a_stacked, lora_b_stacked, indices, 0, + 1.0) + return buffer, output.view_as(org_output) + + +def _apply_lora_triton( + x: torch.Tensor, + lora_a_stacked: torch.Tensor, + lora_b_stacked: torch.Tensor, + b_seq_start_tensor: torch.Tensor, + seq_length_tensor: torch.Tensor, + lora_index_tensor: torch.Tensor, + batch_mlength_lst: List[int], + output: torch.Tensor, +): + # """Applies lora to each input. + + # This method applies all loras to each input. It uses the + # indices vector to determine which lora yields the + # correct output. An index of -1 means no lora should be + # applied. This method adds the final lora results to the + # output. + + # Input shapes: + # x: (batch_size, hidden_dim) + # lora_a_stacked: (num_loras, lora_rank, hidden_dim) + # lora_b_stacked: (num_loras, output_dim, lora_rank) + # indices: (batch_size) + # output: (batch_size, output_dim) + # """ + org_output = output + x = x.view(-1, x.shape[-1]) + output = output.view(-1, output.shape[-1]) + + batch_size = batch_mlength_lst[0] + max_length = batch_mlength_lst[1] + + buffer = add_lora_triton(output, x, lora_a_stacked, lora_b_stacked, + b_seq_start_tensor, seq_length_tensor, + lora_index_tensor, batch_size, max_length, 0, 1.0) + return buffer, output.view_as(org_output) def _apply_lora_packed_nslice( @@ -133,12 +174,64 @@ def _apply_lora_packed_nslice( return output.view_as(org_output) +def _apply_lora_triton_nslice( + x: torch.Tensor, + lora_a_stacked: Tuple[torch.Tensor, torch.Tensor, torch.Tensor], + lora_b_stacked: Tuple[torch.Tensor, torch.Tensor, torch.Tensor], + b_seq_start_tensor: torch.Tensor, + seq_length_tensor: torch.Tensor, + lora_index_tensor: torch.Tensor, + batch_mlength_lst: List[int], + output: torch.Tensor, + output_slices: Tuple[int, ...], +): + # """Applies lora to each input. + + # This method applies all loras to each input. It uses the + # indices vector to determine which lora yields the + # correct output. An index of -1 means no lora should be + # applied. This method adds the final lora results to the + # output. 
+ + # Input shapes: + # x: (batch_size, hidden_dim) + # lora_a_stacked: (num_loras, lora_rank, hidden_dim) + # lora_b_stacked: (num_loras, output_dim, lora_rank) + # indices: (batch_size) + # output: (batch_size, output_dim) + # """ + org_output = output + x = x.view(-1, x.shape[-1]) + output = output.view(-1, output.shape[-1]) + + batch_size = batch_mlength_lst[0] + max_length = batch_mlength_lst[1] + + offset_left = 0 + #TODO fuse these kernel + for slice_idx in range(len(output_slices)): + add_lora_triton_slice(output, x, lora_a_stacked[slice_idx], + lora_b_stacked[slice_idx], b_seq_start_tensor, + seq_length_tensor, lora_index_tensor, batch_size, + max_length, 0, 1.0, offset_left, + output_slices[slice_idx]) + offset_left += output_slices[slice_idx] + + return output.view_as(org_output) + + @dataclass class LoRAMapping: # Per every token in input_ids: index_mapping: Tuple[int, ...] # Per sampled token: prompt_mapping: Tuple[int, ...] + # Per batch lora index + batch_mapping: List[int]=field(default_factory=list) + # Per batch seq length + seq_lens: List[int]=field(default_factory=list) + # prefilling or decoding. + is_prefilling: bool=False def __post_init__(self): self.index_mapping = tuple(self.index_mapping) @@ -193,6 +286,13 @@ def set_mapping( """Sets the mapping indices.""" ... + def set_kernel_mapping(self, seq_length_tensor: torch.Tensor, + b_seq_start_tensor: torch.Tensor, + lora_index_tensor: torch.Tensor, + batch_mlength_lst: List[int]): + """Sets the kernel mapping""" + ... + @classmethod def can_replace_layer(cls, source_layer: nn.Module, lora_config: LoRAConfig, packed_modules_list: List, @@ -270,6 +370,11 @@ def create_lora_weights( self.indices_len: List[int] self.embeddings_indices: torch.Tensor + self.seq_length_tensor: torch.Tensor + self.b_seq_start_tensor: torch.Tensor + self.lora_index_tensor: torch.Tensor + self.batch_mlength_list: List[int] + def reset_lora(self, index: int): self.lora_a_stacked[index] = 0 self.lora_b_stacked[index] = 0 @@ -316,6 +421,18 @@ def set_mapping( self.embeddings_indices = embeddings_indices self.indices_len = indices_len + def set_kernel_mapping( + self, + seq_length_tensor: torch.Tensor, + b_seq_start_tensor: torch.Tensor, + lora_index_tensor: torch.Tensor, + batch_mlength_lst: List[int], + ): + self.seq_length_tensor = seq_length_tensor + self.b_seq_start_tensor = b_seq_start_tensor + self.lora_index_tensor = lora_index_tensor + self.batch_mlength_list = batch_mlength_lst + def forward(self, x: torch.Tensor) -> torch.Tensor: added_tokens_mask = x > self.base_layer.org_vocab_size - 1 embedding_len = self.indices_len[3] @@ -336,8 +453,20 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: full_lora_a_embeddings = full_lora_a_embeddings.view( full_lora_a_embeddings.shape[0] * full_lora_a_embeddings.shape[1], -1) - bgmv(full_output, full_lora_a_embeddings, self.lora_b_stacked, - self.indices[:self.indices_len[0]], 0, 1.0) + batchs, max_length = self.batch_mlength_list[ + 0], self.batch_mlength_list[1] + + sgmv_expand( + full_lora_a_embeddings, + self.lora_b_stacked, + full_output, + self.b_seq_start_tensor[:batchs], + self.seq_length_tensor[:batchs], + self.lora_index_tensor[:batchs], + batchs, + max_length, + True, + ) return full_output.view_as(full_output_org) @classmethod @@ -393,6 +522,10 @@ def create_lora_weights( # lazily initialized. 
self.indices: torch.Tensor self.indices_len: List[int] + self.seq_length_tensor: torch.Tensor + self.b_seq_start_tensor: torch.Tensor + self.lora_index_tensor: torch.Tensor + self.batch_mlength_list: List[int] def reset_lora(self, index: int): self.lora_a_stacked[index] = 0 @@ -441,16 +574,28 @@ def set_mapping( self.indices = base_indices self.indices_len = indices_len + def set_kernel_mapping( + self, + seq_length_tensor: torch.Tensor, + b_seq_start_tensor: torch.Tensor, + lora_index_tensor: torch.Tensor, + batch_mlength_lst: List[int], + ): + self.seq_length_tensor = seq_length_tensor + self.b_seq_start_tensor = b_seq_start_tensor + self.lora_index_tensor = lora_index_tensor + self.batch_mlength_list = batch_mlength_lst + def apply(self, x: torch.Tensor, bias: Optional[torch.Tensor]) -> torch.Tensor: output = self.base_layer.quant_method.apply(self.base_layer, x, bias) - _apply_lora( - x, - self.lora_a_stacked, - self.lora_b_stacked, - self.indices[:self.indices_len[0]], - output, - ) + batch_size = self.batch_mlength_list[0] + # maybe we need not restrict range to [:batch_size] + _apply_lora_triton(x, self.lora_a_stacked, self.lora_b_stacked, + self.b_seq_start_tensor[:batch_size], + self.seq_length_tensor[:batch_size], + self.lora_index_tensor[:batch_size], + self.batch_mlength_list, output) return output def forward(self, input_): @@ -542,6 +687,11 @@ def create_lora_weights( # Lazily initialized. self.indices: torch.Tensor + self.seq_length_tensor: torch.Tensor + self.b_seq_start_tensor: torch.Tensor + self.lora_index_tensor: torch.Tensor + self.batch_mlength_list: List[int] + def reset_lora(self, index: int): self.lora_a_stacked[0][index] = 0 self.lora_a_stacked[1][index] = 0 @@ -597,14 +747,32 @@ def set_lora( def apply(self, x: torch.Tensor, bias: Optional[torch.Tensor]) -> torch.Tensor: output = self.base_layer.quant_method.apply(self.base_layer, x, bias) - _apply_lora_packed_nslice( + # output_temp=output.clone() + # _apply_lora_packed_nslice( + # x, + # self.lora_a_stacked, + # self.lora_b_stacked, + # self.indices[:self.indices_len[0]], + # output, + # (self.output_dim, self.output_dim), + # ) + batchs = self.batch_mlength_list[0] + _apply_lora_triton_nslice( x, self.lora_a_stacked, self.lora_b_stacked, - self.indices[:self.indices_len[0]], + self.b_seq_start_tensor[:batchs], + self.seq_length_tensor[:batchs], + self.lora_index_tensor[:batchs], + self.batch_mlength_list, output, (self.output_dim, self.output_dim), ) + # flag=torch.allclose(output,output_temp,1e-2,1e-2) + # if flag: + # print("pass") + # else: + # print() return output @classmethod @@ -774,6 +942,11 @@ def create_lora_weights( # lazily initialized. 
self.indices_len: List[int] + self.seq_length_tensor: torch.Tensor + self.b_seq_start_tensor: torch.Tensor + self.lora_index_tensor: torch.Tensor + self.batch_mlength_list: List[int] + def reset_lora(self, index: int): self.lora_a_stacked[0][index] = 0 self.lora_b_stacked[0][index] = 0 @@ -851,14 +1024,27 @@ def set_lora( def apply(self, x: torch.Tensor, bias: Optional[torch.Tensor]) -> torch.Tensor: output = self.base_layer.quant_method.apply(self.base_layer, x, bias) - _apply_lora_packed_nslice( + # _apply_lora_packed_nslice( + # x, + # self.lora_a_stacked, + # self.lora_b_stacked, + # self.indices[:self.indices_len[0]], + # output, + # self.output_slices, + # ) + batchs = self.batch_mlength_list[0] + _apply_lora_triton_nslice( x, self.lora_a_stacked, self.lora_b_stacked, - self.indices[:self.indices_len[0]], + self.b_seq_start_tensor[:batchs], + self.seq_length_tensor[:batchs], + self.lora_index_tensor[:batchs], + self.batch_mlength_list, output, self.output_slices, ) + return output @classmethod @@ -915,6 +1101,11 @@ def create_lora_weights( self.indices: torch.Tensor self.indices_len: List[int] + self.seq_length_tensor: torch.Tensor + self.b_seq_start_tensor: torch.Tensor + self.lora_index_tensor: torch.Tensor + self.batch_mlength_list: List[int] + def reset_lora(self, index: int): self.lora_a_stacked[index] = 0 self.lora_b_stacked[index] = 0 @@ -962,17 +1153,56 @@ def set_mapping( self.indices = base_indices self.indices_len = indices_len + def set_kernel_mapping( + self, + seq_length_tensor: torch.Tensor, + b_seq_start_tensor: torch.Tensor, + lora_index_tensor: torch.Tensor, + batch_mlength_lst: List[int], + ): + self.seq_length_tensor = seq_length_tensor + self.b_seq_start_tensor = b_seq_start_tensor + self.lora_index_tensor = lora_index_tensor + self.batch_mlength_list = batch_mlength_lst + def apply(self, x: torch.Tensor) -> torch.Tensor: output = self.base_layer.quant_method.apply(self.base_layer, x) - _apply_lora( - x, - self.lora_a_stacked, - self.lora_b_stacked, - self.indices[:self.indices_len[0]], - output, - ) + batch_size = self.batch_mlength_list[0] + # maybe we need not restrict range to [:batch_size] + _apply_lora_triton(x, self.lora_a_stacked, self.lora_b_stacked, + self.b_seq_start_tensor[:batch_size], + self.seq_length_tensor[:batch_size], + self.lora_index_tensor[:batch_size], + self.batch_mlength_list, output) return output + # def apply(self, x: torch.Tensor) -> torch.Tensor: + # output = self.base_layer.quant_method.apply(self.base_layer, x) + # temp_output = output.clone() + # output2 = output.clone() + # mid_buffer,_=_apply_lora( + # x, + # self.lora_a_stacked, + # self.lora_b_stacked, + # self.indices[:self.indices_len[0]], + # output, + # ) + # batch_size = self.batch_mlength_list[0] + # # print(f"self.indices[:self.indices_len[0]]={ self.indices[:self.indices_len[0]]},\ + # # lora_index_tensor={self.lora_index_tensor[:batch_size]},batch={self.batch_mlength_list[0]}") + # # # + # mid2_buffer,_=_apply_lora_triton(x, self.lora_a_stacked, self.lora_b_stacked, + # self.b_seq_start_tensor[:batch_size], + # self.seq_length_tensor[:batch_size], + # self.lora_index_tensor[:batch_size], + # self.batch_mlength_list, output) + # flag = torch.allclose(mid_buffer, mid2_buffer, 3e-2, 2e-2) + # # if not flag: + # # print("error") + # # else: + # # print("pass") + # return temp_output + def forward(self, input_): """Forward of RowParallelLinear @@ -1103,6 +1333,11 @@ def create_lora_weights( self.indices_len: List[int] self.indices_padded: torch.Tensor + 
self.seq_length_tensor: torch.Tensor + self.b_seq_start_tensor: torch.Tensor + self.lora_index_tensor: torch.Tensor + self.batch_mlength_list: List[int] + def reset_lora(self, index: int): self.lora_a_stacked[index] = 0 self.lora_b_stacked[index] = 0 @@ -1140,6 +1375,18 @@ def set_mapping( self.indices_padded = sampler_indices_padded self.indices_len = indices_len + def set_kernel_mapping( + self, + seq_length_tensor: torch.Tensor, + b_seq_start_tensor: torch.Tensor, + lora_index_tensor: torch.Tensor, + batch_mlength_lst: List[int], + ): + self.seq_length_tensor = seq_length_tensor + self.b_seq_start_tensor = b_seq_start_tensor + self.lora_index_tensor = lora_index_tensor + self.batch_mlength_list = batch_mlength_lst + def _get_logits( self, hidden_states: torch.Tensor, @@ -1186,6 +1433,17 @@ def _get_logits( logits, ) + # batch_size=self.batch_mlength_list[0] + # _apply_lora_triton(hidden_states, self.lora_a_stacked, self.lora_b_stacked, + # self.b_seq_start_tensor[:batch_size], + # self.seq_length_tensor[:batch_size], + # self.indices[:self.indices_len[1]], + # self.batch_mlength_list, logits_temp) + # flag=torch.allclose(logits_temp,logits,rtol=1e-2,atol=1e-2) + # if flag: + # print("pass") + # else: + # print("error") # Remove paddings in vocab (if any). logits = logits[:, :self.base_layer.vocab_size] diff --git a/vllm/lora/models.py b/vllm/lora/models.py index 3e82856866d8..392b8b4a6c51 100644 --- a/vllm/lora/models.py +++ b/vllm/lora/models.py @@ -81,7 +81,7 @@ def convert_mapping( embeddings_indices, long_lora_indices). If long_lora doesn't exist, it only contains first 4 entries. """ - index_mapping_indices: List[int] = list(mapping.index_mapping).copy() + index_mapping_indices: List[int] = list(mapping.batch_mapping).copy() embedding_indices = index_mapping_indices.copy() lora_indices = index_mapping_indices.copy() long_lora_offsets: Optional[torch.Tensor] = None @@ -427,6 +427,19 @@ def __init__( # Dict instead of a Set for compatibility with LRUCache. 
self._active_loras: Dict[int, None] = {} self._last_mapping: Optional[LoRAMapping] = None + + # triton kernel mapping + + self.batch_mlength_lst = [-1] * 2 + self.seq_length_tensor = torch.empty(self.max_num_batched_tokens, + dtype=torch.long, + device="cuda") + self.b_seq_start_tensor = torch.empty(self.max_num_batched_tokens, + dtype=torch.long, + device="cuda") + self.lora_index_tensor = torch.empty(self.max_num_batched_tokens, + dtype=torch.long, + device="cuda") self._create_lora_modules() self.model.lora_manager = self @@ -548,6 +561,8 @@ def _set_lora_mapping(self, mapping: LoRAMapping) -> None: # Maintain the reference self.indices_len[:] = indices_len + + def set_lora_mapping(self, lora_mapping: LoRAMapping) -> None: if self._last_mapping != lora_mapping: self._set_lora_mapping(lora_mapping) @@ -600,6 +615,10 @@ def _create_lora_modules(self): self.sampler_indices_padded, self.embeddings_indices, self.long_lora_indices, self.indices_len) + new_module.set_kernel_mapping(self.seq_length_tensor, + self.b_seq_start_tensor, + self.lora_index_tensor, + self.batch_mlength_lst) def register_module(self, module_name: str, module: "BaseLayerWithLoRA"): assert isinstance(module, BaseLayerWithLoRA) diff --git a/vllm/lora/ops/sgmv_expand.py b/vllm/lora/ops/sgmv_expand.py index 354778926250..c68c551db89e 100644 --- a/vllm/lora/ops/sgmv_expand.py +++ b/vllm/lora/ops/sgmv_expand.py @@ -24,10 +24,11 @@ def _sgmv_expand_kernel( BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr, EVEN_K: tl.constexpr, + ADD_INPUTS: tl.constexpr, + CAST_TYPE: tl.constexpr, ): """ The sgmv's expand triton kernel is based on GroupGEMM. - The GEMM of Multi-LoRA can be considered as GroupGEMM. """ pid = tl.program_id(axis=0) cur_batch = tl.program_id(axis=1) @@ -63,13 +64,16 @@ def _sgmv_expand_kernel( tiled_b = tl.load(b_ptr, mask=offset_k[:, None] < K - k * BLOCK_K, other=0) + if CAST_TYPE: + tiled_a = tiled_a.to(lora_ptr.dtype.element_ty) + #TODO Can I use D=A@B+C ? accumulator += tl.dot( tiled_a, tiled_b, ) a_ptr += BLOCK_K * xk_stride b_ptr += BLOCK_K * lora_n_stride - tiled_c = accumulator.to(input_ptr.dtype.element_ty) + tiled_c = accumulator.to(lora_ptr.dtype.element_ty) offset_cm = cur_seq_start + tl.arange(0, BLOCK_M) + pid_m * BLOCK_M offset_cn = tl.arange(0, BLOCK_N) + pid_n * BLOCK_N c_ptr = (out_ptr + offset_cm[:, None] * cm_stride + @@ -77,6 +81,9 @@ def _sgmv_expand_kernel( M = tl.load(seq_lens + cur_batch) c_mask = (offset_cm[:, None] < (cur_seq_start + M)) & (offset_cn[None, :] < N) + if ADD_INPUTS: + tiled_out = tl.load(c_ptr, mask=c_mask) + tiled_c += tiled_out tl.store(c_ptr, tiled_c, mask=c_mask) @@ -90,9 +97,11 @@ def sgmv_expand( lora_indices_tensor: torch.Tensor, batchs: int, max_seq_length: int, + add_inputs: bool = False, ): """_summary_ + Args: inputs (torch.Tensor): input tensor lora_b_weights (torch.Tensor): lora'a weight @@ -108,27 +117,48 @@ def sgmv_expand( batchs (int): batch size max_seq_length (int): The max sequence lengths of the sequences in the batch + add_inputs (bool, optional): _description_. Defaults to False. + cast_type (bool, optional): _description_. Defaults to False. 
""" - assert inputs.dtype == lora_b_weights.dtype + assert inputs.dtype in [torch.float16, torch.bfloat16, torch.float32] + assert lora_b_weights.dtype in [ + torch.float16, + torch.bfloat16, + torch.float32, + ] assert inputs.size(1) == lora_b_weights.size(-1) assert b_seq_start_loc.size(0) == batchs assert lora_indices_tensor.size(0) == batchs assert inputs.is_contiguous() - assert lora_b_weights.is_contiguous() assert output_tensor.is_contiguous() + + if lora_b_weights.ndim == 4: # shape:(lora_num,1,size,rank) + assert lora_b_weights.size(1) == 1 + lora_b_weights = lora_b_weights.squeeze(dim=1) + else: + assert lora_b_weights.ndim == 3 # shape:(lora_num,size,rank) + + assert lora_b_weights.is_contiguous() + # TODO tuning this config - _, N, K = lora_b_weights.shape # K= rank,N=hidden_size + + N, K = lora_b_weights.shape[-2:] # K= rank,N=hidden_size BLOCK_M = 32 BLOCK_N = 32 BLOCK_K = 16 EVEN_K = K % BLOCK_K == 0 - + ADD_INPUTS = add_inputs + CAST_TYPE = False + if inputs.dtype == torch.float32 and lora_b_weights.dtype in [ + torch.float16, + torch.bfloat16, + ]: + CAST_TYPE = True grid = [ triton.cdiv(max_seq_length, BLOCK_M) * triton.cdiv(N, BLOCK_N), batchs, ] - _sgmv_expand_kernel[grid]( inputs, lora_b_weights, @@ -149,5 +179,7 @@ def sgmv_expand( BLOCK_N, BLOCK_K, EVEN_K, + ADD_INPUTS, + CAST_TYPE, ) return diff --git a/vllm/lora/ops/sgmv_expand_slice.py b/vllm/lora/ops/sgmv_expand_slice.py new file mode 100644 index 000000000000..a8d93aa196a2 --- /dev/null +++ b/vllm/lora/ops/sgmv_expand_slice.py @@ -0,0 +1,207 @@ +import torch +import triton +import triton.language as tl + + +@triton.jit +def _sgmv_expand_slice_kernel( + input_ptr, + lora_ptr, + out_ptr, + N, + K, + b_seq_start_loc, + seq_lens, + lora_indices, + xm_stride, + xk_stride, # 1 + l0_stride, # hidden_size*max_rank + lora_k_stride, + lora_n_stride, + cm_stride, + cn_stride, + slice_offset, + BLOCK_M: tl.constexpr, + BLOCK_N: tl.constexpr, + BLOCK_K: tl.constexpr, + EVEN_K: tl.constexpr, + ADD_INPUTS: tl.constexpr, + CAST_TYPE: tl.constexpr, +): + """ + + Similar to the 'sgmv_expand' operator, but with an added parameter + 'slice_offset'. The reason for not reusing the 'sgmv_expand' operator + might be that in the future, we could implement a fusion operator to + achieve the current functionality instead of having to call it multiple + times. 
+ """ + pid = tl.program_id(axis=0) + cur_batch = tl.program_id(axis=1) + cta_n_num = tl.cdiv(N, BLOCK_N) + pid_m = pid // cta_n_num + pid_n = pid % cta_n_num + M = tl.load(seq_lens + cur_batch) + if pid_m * BLOCK_M > M: + return + lora_index = tl.load(lora_indices + cur_batch) + if lora_index == -1: + return + cur_seq_start = tl.load(b_seq_start_loc + cur_batch) + offset_m = tl.arange(0, BLOCK_M) + pid_m * BLOCK_M + offset_n = tl.arange(0, BLOCK_N) + pid_n * BLOCK_N + offset_k = tl.arange(0, BLOCK_K) + ram = tl.max_contiguous(tl.multiple_of(offset_m % M, BLOCK_M), BLOCK_M) + rbn = tl.max_contiguous(tl.multiple_of(offset_n % N, BLOCK_N), BLOCK_N) + + a_ptr = ( + input_ptr + + cur_seq_start * xm_stride + + ram[:, None] * xm_stride + + offset_k[None, :] * xk_stride, + ) + b_ptr = ( + lora_ptr + + l0_stride * lora_index + + offset_k[:, None] * lora_n_stride + + rbn[None, :] * lora_k_stride + ) + accumulator = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32) + for k in range(tl.cdiv(K, BLOCK_K)): + if EVEN_K: + tiled_a = tl.load(a_ptr) + tiled_b = tl.load(b_ptr) + else: + tiled_a = tl.load( + a_ptr, mask=offset_k[None, :] < K - k * BLOCK_K, other=0 + ) + tiled_b = tl.load( + b_ptr, mask=offset_k[:, None] < K - k * BLOCK_K, other=0 + ) + if CAST_TYPE: + tiled_a = tiled_a.to(lora_ptr.dtype.element_ty) + # TODO Can I use D=A@B+C ? + accumulator += tl.dot( + tiled_a, + tiled_b, + ) + a_ptr += BLOCK_K * xk_stride + b_ptr += BLOCK_K * lora_n_stride + tiled_c = accumulator.to(lora_ptr.dtype.element_ty) + offset_cm = cur_seq_start + tl.arange(0, BLOCK_M) + pid_m * BLOCK_M + offset_cn = tl.arange(0, BLOCK_N) + pid_n * BLOCK_N+slice_offset + c_ptr = ( + out_ptr + + offset_cm[:, None] * cm_stride + + offset_cn[None, :] * cn_stride + ) + M = tl.load(seq_lens + cur_batch) + c_mask = (offset_cm[:, None] < (cur_seq_start + M)) & ( + offset_cn[None, :] < (slice_offset+N) + ) + if ADD_INPUTS: + tiled_out = tl.load(c_ptr, mask=c_mask) + tiled_c += tiled_out + tl.store(c_ptr, tiled_c, mask=c_mask) + + +@torch.inference_mode() +def sgmv_expand_slice( + inputs: torch.Tensor, + lora_b_weights: torch.Tensor, + output_tensor: torch.Tensor, + b_seq_start_loc: torch.Tensor, + seq_len_tensor: torch.Tensor, + lora_indices_tensor: torch.Tensor, + batchs: int, + max_seq_length: int, + slice_offset: int, + slice_size: int, + add_inputs: bool = False, +): + """_summary_ + + Args: + inputs (torch.Tensor): input tensor + lora_b_weights (torch.Tensor): lora'a weight + output_tensor (torch.Tensor): output tensor + b_seq_start_loc (torch.Tensor): (batch_size,). The cumulative + sequence lengths of the sequences in the batch, used to index + into sequence. E.g.,if the sequence length is [4, 6], it is + [0, 4, 10]. + seq_len_tensor (torch.Tensor): (batch_size,). record the sequence + length of the sequences in the batch + lora_indices_tensor (torch.Tensor): (batch_size,). The LoRA index + corresponding to each batch + batchs (int): batch size + max_seq_length (int): The max sequence lengths of the sequences + in the batch + slice_offst (int): output_tensor's offst + slice_size (int): current output_tensor's size + add_inputs (bool, optional): _description_. Defaults to False. 
+ """ + + assert inputs.dtype in [torch.float16, torch.bfloat16, torch.float32] + assert lora_b_weights.dtype in [ + torch.float16, + torch.bfloat16, + torch.float32, + ] + assert inputs.size(1) == lora_b_weights.size(-1) + assert b_seq_start_loc.size(0) == batchs + assert lora_indices_tensor.size(0) == batchs + assert slice_size==lora_b_weights.size(-2) + assert inputs.is_contiguous() + assert output_tensor.is_contiguous() + + if lora_b_weights.ndim == 4: # shape:(lora_num,1,size,rank) + assert lora_b_weights.size(1) == 1 + lora_b_weights = lora_b_weights.squeeze(dim=1) + else: + assert lora_b_weights.ndim == 3 # shape:(lora_num,size,rank) + + assert lora_b_weights.is_contiguous() + + # TODO tuning this config + N, K = lora_b_weights.shape[-2:] # K= rank,N=hidden_size + + BLOCK_M = 32 + BLOCK_N = 32 + BLOCK_K = 16 + EVEN_K = K % BLOCK_K == 0 + ADD_INPUTS = add_inputs + CAST_TYPE = False + if inputs.dtype == torch.float32 and lora_b_weights.dtype in [ + torch.float16, + torch.bfloat16, + ]: + CAST_TYPE = True + grid = [ + triton.cdiv(max_seq_length, BLOCK_M) * triton.cdiv(N, BLOCK_N), + batchs, + ] + _sgmv_expand_slice_kernel[grid]( + inputs, + lora_b_weights, + output_tensor, + N, + K, + b_seq_start_loc, + seq_len_tensor, + lora_indices_tensor, + inputs.stride(0), + inputs.stride(1), + lora_b_weights.stride(0), + lora_b_weights.stride(1), + lora_b_weights.stride(2), + output_tensor.stride(0), + output_tensor.stride(1), + slice_offset, + BLOCK_M, + BLOCK_N, + BLOCK_K, + EVEN_K, + ADD_INPUTS, + CAST_TYPE, + ) + return diff --git a/vllm/lora/ops/sgmv_shrink.py b/vllm/lora/ops/sgmv_shrink.py index d3858d91791e..1b7cf0f3caa6 100644 --- a/vllm/lora/ops/sgmv_shrink.py +++ b/vllm/lora/ops/sgmv_shrink.py @@ -28,8 +28,8 @@ def _sgmv_shrink_kernel( SPLIT_K: tl.constexpr, ): """ - The sgmv's shrink triton kernel is based on GroupGEMM+SPLIT-K. - The GEMM of Multi-LoRA can be considered as GroupGEMM. Additionally, + The sgmv's shrink triton kernel is based on GroupGEMM+SPLIT-K. + The GEMM of Multi-LoRA can be considered as GroupGEMM. 
Additionally, introducing SPLIT-K can improve performance """ pid = tl.program_id(axis=0) @@ -43,8 +43,6 @@ def _sgmv_shrink_kernel( if pid_m * BLOCK_M > M: return lora_index = tl.load(lora_indices + cur_batch) - if lora_index == -1: - return cur_seq_start = tl.load(b_seq_start_loc + cur_batch) offset_m = tl.arange(0, BLOCK_M) + pid_m * BLOCK_M offset_n = tl.arange(0, BLOCK_N) + pid_n * BLOCK_N @@ -57,6 +55,7 @@ def _sgmv_shrink_kernel( offset_k[None, :] * xk_stride) b_ptr = (lora_ptr + l0_stride * lora_index + rbn[None, :] * lora_k_stride + offset_k[:, None] * lora_n_stride) + accumulator = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32) for k in range(0, tl.cdiv(K, BLOCK_K * SPLIT_K)): if EVEN_K: @@ -67,13 +66,14 @@ def _sgmv_shrink_kernel( a = tl.load(a_ptr, mask=offset_k[None, :] < k_remaining, other=0.0) b = tl.load(b_ptr, mask=offset_k[:, None] < k_remaining, other=0.0) accumulator += tl.dot(a, b) + a_ptr += BLOCK_K * SPLIT_K * xk_stride b_ptr += BLOCK_K * SPLIT_K * lora_n_stride offset_cm = cur_seq_start + tl.arange(0, BLOCK_M) + pid_m * BLOCK_M + offset_cn = tl.arange(0, BLOCK_N) + pid_n * BLOCK_N c_ptr = (out_ptr + offset_cm[:, None] * cm_stride + offset_cn[None, :] * cn_stride) - c_mask = (offset_cm[:, None] < (cur_seq_start + M)) & (offset_cn[None, :] < N) accumulator *= scaling @@ -121,16 +121,21 @@ def sgmv_shrink( assert b_seq_start_loc.size(0) == batchs assert lora_indices_tensor.size(0) == batchs assert inputs.is_contiguous() + + if lora_a_weights.ndim == 4: # shape:(lora_num,1,rank, size) + assert lora_a_weights.size(1) == 1 + lora_a_weights = lora_a_weights.squeeze(dim=1) + else: + assert lora_a_weights.ndim == 3 # shape:(lora_num,rank, size) assert lora_a_weights.is_contiguous() assert output_tensor.is_contiguous() # TODO tuning this config - _, N, K = lora_a_weights.shape # K=hidden_size,N=rank + N, K = lora_a_weights.shape[-2:] # K=hidden_size,N=rank BLOCK_M = 32 - BLOCK_N = 32 + BLOCK_N = 16 BLOCK_K = 32 - SPLIT_K = 8 - EVEN_K = K % (SPLIT_K * BLOCK_K) == 0 - + SPLIT_K = 1 + EVEN_K = False grid = [ triton.cdiv(max_seq_length, BLOCK_M) * triton.cdiv(N, BLOCK_N), SPLIT_K, diff --git a/vllm/lora/punica.py b/vllm/lora/punica.py index 1e6cb83f719d..fe7319f93b96 100644 --- a/vllm/lora/punica.py +++ b/vllm/lora/punica.py @@ -4,8 +4,9 @@ import torch -# from vllm.lora.ops.sgmv_expand import sgmv_expand -# from vllm.lora.ops.sgmv_shrink import sgmv_shrink +from vllm.lora.ops.sgmv_expand import sgmv_expand +from vllm.lora.ops.sgmv_shrink import sgmv_shrink +from vllm.lora.ops.sgmv_expand_slice import sgmv_expand_slice def _raise_import_error(e): @@ -52,10 +53,16 @@ def bgmv( punica_kernels.dispatch_bgmv(y, x, w_t_all, indicies, layer_idx, scale) -def dispatch_bgmv_low_level(y: torch.Tensor, x: torch.Tensor, - w_t_all: torch.Tensor, indicies: torch.LongTensor, - layer_idx: int, scale: float, y_offset: int, - y_slice_size: int): +def dispatch_bgmv_low_level( + y: torch.Tensor, + x: torch.Tensor, + w_t_all: torch.Tensor, + indicies: torch.LongTensor, + layer_idx: int, + scale: float, + y_offset: int, + y_slice_size: int, +): """ Same as `bgmv` but you can operate on slices of y. Pass whole y, define y_offset and y_slice_size. 
@@ -95,15 +102,17 @@ def dispatch_bgmv_low_level(y: torch.Tensor, x: torch.Tensor, ) -def add_lora(y: torch.Tensor, - x: torch.Tensor, - wa_t_all: torch.Tensor, - wb_t_all: torch.Tensor, - indicies: torch.LongTensor, - layer_idx: int, - scale: float, - *, - buffer: Optional[torch.Tensor] = None): +def add_lora( + y: torch.Tensor, + x: torch.Tensor, + wa_t_all: torch.Tensor, + wb_t_all: torch.Tensor, + indicies: torch.LongTensor, + layer_idx: int, + scale: float, + *, + buffer: Optional[torch.Tensor] = None, +): """ Semantics: y[i] += ( @@ -141,19 +150,70 @@ def add_lora(y: torch.Tensor, punica_kernels.dispatch_bgmv(buffer, x, wa_t_all, indicies, layer_idx, 1.0) punica_kernels.dispatch_bgmv(y, buffer, wb_t_all, indicies, layer_idx, scale) + return buffer -def add_lora_slice(y: torch.Tensor, - x: torch.Tensor, - wa_t_all: torch.Tensor, - wb_t_all: torch.Tensor, - indicies: torch.LongTensor, - layer_idx: int, - scale: float, - y_offset: int, - y_slice_size: int, - *, - buffer: Optional[torch.Tensor] = None): +def add_lora_triton( + y: torch.Tensor, + x: torch.Tensor, + wa_t_all: torch.Tensor, + wb_t_all: torch.Tensor, + b_seq_start_tensor: torch.Tensor, + seq_length_tensor: torch.Tensor, + lora_indices_tensor: torch.Tensor, + batch_size: int, + max_length: int, + layer_idx: int, + scale: float, + *, + buffer: Optional[torch.Tensor] = None, +): + r = wb_t_all.size(-1) + if buffer is None: + # We set the buffer to be float32 by default to avoid + # numerical inaccuracies that would otherwise happen + # due to downcasting. + buffer = torch.zeros((x.size(0), r), + dtype=torch.float32, + device=x.device) + sgmv_shrink( + x, + wa_t_all, + buffer, + b_seq_start_tensor, + seq_length_tensor, + lora_indices_tensor, + batch_size, + max_length, + scale, + ) + sgmv_expand( + buffer, + wb_t_all, + y, + b_seq_start_tensor, + seq_length_tensor, + lora_indices_tensor, + batch_size, + max_length, + add_inputs=True, + ) + return buffer + + +def add_lora_slice( + y: torch.Tensor, + x: torch.Tensor, + wa_t_all: torch.Tensor, + wb_t_all: torch.Tensor, + indicies: torch.LongTensor, + layer_idx: int, + scale: float, + y_offset: int, + y_slice_size: int, + *, + buffer: Optional[torch.Tensor] = None, +): """ Same as `add_lora` but you can operate on slices of y. Pass whole y, define y_offset and y_slice_size. @@ -214,3 +274,84 @@ def add_lora_slice(y: torch.Tensor, y_slice_size, y_offset, ) + + +def add_lora_triton_slice( + y: torch.Tensor, + x: torch.Tensor, + wa_t_all: torch.Tensor, + wb_t_all: torch.Tensor, + b_seq_start_tensor: torch.Tensor, + seq_length_tensor: torch.Tensor, + lora_indices_tensor: torch.Tensor, + batch_size: int, + max_length: int, + layer_idx: int, + scale: float, + y_offset: int, + y_slice_size: int, + *, + buffer: Optional[torch.Tensor] = None, +): + """ + Same as `add_lora` but you can operate on slices of y. + Pass whole y, define y_offset and y_slice_size. + + Semantics: + y[i] += ( + x[i].unsqueeze(0) + @ wa_t_all[indices[i], layer_idx, :, :].transpose(-1, -2) + @ wb_t_all[indices[i], layer_idx, :, :].transpose(-1, -2) + * scale + ).squeeze(0) + + Args: + y: Shape: `[B, H2]`. Output vectors. Will be changed in-place. + x: Shape: `[B, H1]`. Input vectors. + wa_t_all: Shape: `[None, L, R, H1]`. All of the transposed + LoRA A matrices. + wb_t_all: Shape: `[None, L, H2, R]`. All of the transposed + LoRA B matrices. + indicies: Shape: `[B]`. Indices of the LoRA weights. + layer_idx: Layer index of LoRA weights. + scale: Scaling factor. + y_offset: Offset to apply to the starting column of y. 
+ y_slice_size: Size of the y column slice. + # """ + # try: + # import vllm._punica_C as punica_kernels + # except ImportError as e: + # _raise_import_error(e) + + r = wb_t_all.size(-1) + if buffer is None: + # We set the buffer to be float32 by default to avoid + # numerical inaccuracies that would otherwise happen + # due to downcasting. + buffer = torch.zeros((x.size(0), r), + dtype=torch.float32, + device=x.device) + sgmv_shrink( + x, + wa_t_all, + buffer, + b_seq_start_tensor, + seq_length_tensor, + lora_indices_tensor, + batch_size, + max_length, + scale, + ) + sgmv_expand_slice( + buffer, + wb_t_all, + y, + b_seq_start_tensor, + seq_length_tensor, + lora_indices_tensor, + batch_size, + max_length, + y_offset, + y_slice_size, + add_inputs=True, + ) diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 87d5f5c1b9d6..d6713b59944e 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -233,6 +233,7 @@ def _prepare_model_input( input_positions: List[int] = [] slot_mapping: List[int] = [] lora_index_mapping: List[int] = [] + batch_lora_index_mapping: List[int] = [] lora_prompt_mapping: List[int] = [] lora_requests: Set[LoRARequest] = set() @@ -386,6 +387,7 @@ def _prepare_model_input( lora_requests.add(seq_group_metadata.lora_request) lora_index_mapping += [lora_id] * (seq_len - context_len) + batch_lora_index_mapping += [lora_id if lora_id > 0 else -1] lora_prompt_mapping.extend( [lora_id] * (seq_len - @@ -586,9 +588,9 @@ def _prepare_model_input( if self.lora_config: lora_mapping = LoRAMapping( - lora_index_mapping, - lora_prompt_mapping, - ) + lora_index_mapping, lora_prompt_mapping, + batch_lora_index_mapping, query_lens, + bool(attn_metadata.prefill_metadata)) else: lora_mapping = None @@ -788,6 +790,32 @@ def profile_run(self) -> None: torch.cuda.synchronize() return + # def compose_lora_kernel_meta( + # self, + # attn_metadata: AttentionMetadata, + # ) -> LoRAKernelMeta: + # if attn_metadata.prefill_metadata: + # max_seq_len = attn_metadata.max_query_len + # seq_start_loc = attn_metadata.query_start_loc + # seq_lens_tensor = attn_metadata.seq_lens_tensor + # batch_size = attn_metadata.num_prefills + # else: + # max_seq_len = attn_metadata.max_query_len + # seq_start_loc = attn_metadata.query_start_loc + # batch_size = attn_metadata.decode_metadata.num_decode_tokens + # seq_lens_tensor = torch.ones((batch_size), + # dtype=torch.long, + # device=self.device) + + # if batch_size == 0: + # print("sssss") + # # lora_index_lst = lora_mapping.batch_mapping + # # lora_index_tensor = torch.tensor(lora_index_lst, + # # dtype=torch.long, + # # device=self.device) + # return LoRAKernelMeta(batch_size, max_seq_len, seq_lens_tensor, + # seq_start_loc) + def remove_all_loras(self): if not self.lora_manager: raise RuntimeError("LoRA is not enabled.") @@ -881,6 +909,9 @@ def capture_model(self, kv_caches: List[torch.Tensor]) -> None: lora_mapping = LoRAMapping( [0] * batch_size, [0] * batch_size, + [0] * batch_size, + [1] * batch_size, + False ) self.set_active_loras(set(), lora_mapping) From fad4b033cf7a49ce5a5902741feec96742044e87 Mon Sep 17 00:00:00 2001 From: jeejeeli Date: Wed, 29 May 2024 00:33:54 +0800 Subject: [PATCH 05/71] start replacing bgmv --- vllm/lora/layers.py | 18 +++++------ vllm/lora/models.py | 34 ++++++++++++++++++-- vllm/lora/ops/sgmv_expand_slice.py | 50 ++++++++++++------------------ vllm/lora/ops/sgmv_shrink.py | 2 +- vllm/lora/punica.py | 2 -- vllm/worker/model_runner.py | 18 +++++------ 6 files changed, 66 insertions(+), 58 
deletions(-) diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py index 5e4f648f3788..68127fd5fe61 100644 --- a/vllm/lora/layers.py +++ b/vllm/lora/layers.py @@ -1,6 +1,6 @@ # pylint: disable=unused-argument import math -from dataclasses import dataclass,field +from dataclasses import dataclass, field from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Union import torch @@ -127,10 +127,10 @@ def _apply_lora_triton( batch_size = batch_mlength_lst[0] max_length = batch_mlength_lst[1] - buffer = add_lora_triton(output, x, lora_a_stacked, lora_b_stacked, + add_lora_triton(output, x, lora_a_stacked, lora_b_stacked, b_seq_start_tensor, seq_length_tensor, lora_index_tensor, batch_size, max_length, 0, 1.0) - return buffer, output.view_as(org_output) + return output.view_as(org_output) def _apply_lora_packed_nslice( @@ -227,11 +227,11 @@ class LoRAMapping: # Per sampled token: prompt_mapping: Tuple[int, ...] # Per batch lora index - batch_mapping: List[int]=field(default_factory=list) + batch_mapping: List[int] = field(default_factory=list) # Per batch seq length - seq_lens: List[int]=field(default_factory=list) + seq_lens: List[int] = field(default_factory=list) # prefilling or decoding. - is_prefilling: bool=False + is_prefilling: bool = False def __post_init__(self): self.index_mapping = tuple(self.index_mapping) @@ -1188,10 +1188,8 @@ def apply(self, x: torch.Tensor) -> torch.Tensor: # output, # ) # batch_size = self.batch_mlength_list[0] - # # print(f"self.indices[:self.indices_len[0]]={ self.indices[:self.indices_len[0]]},\ - # # lora_index_tensor={self.lora_index_tensor[:batch_size]},batch={self.batch_mlength_list[0]}") - # # # - # mid2_buffer,_=_apply_lora_triton(x, self.lora_a_stacked, self.lora_b_stacked, + # mid2_buffer,_=_apply_lora_triton(x, self.lora_a_stacked, + # self.lora_b_stacked, # self.b_seq_start_tensor[:batch_size], # self.seq_length_tensor[:batch_size], # self.lora_index_tensor[:batch_size], diff --git a/vllm/lora/models.py b/vllm/lora/models.py index 392b8b4a6c51..96e2e51bd93e 100644 --- a/vllm/lora/models.py +++ b/vllm/lora/models.py @@ -81,7 +81,7 @@ def convert_mapping( embeddings_indices, long_lora_indices). If long_lora doesn't exist, it only contains first 4 entries. 
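        For example (illustrative values): with lora_index_to_id = [8, 3],
        a request using LoRA id 3 resolves to slot 1, id 8 to slot 0, and
        id 0 (no LoRA) to -1 in the base/sampler index tensors.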
""" - index_mapping_indices: List[int] = list(mapping.batch_mapping).copy() + index_mapping_indices: List[int] = list(mapping.index_mapping).copy() embedding_indices = index_mapping_indices.copy() lora_indices = index_mapping_indices.copy() long_lora_offsets: Optional[torch.Tensor] = None @@ -434,7 +434,7 @@ def __init__( self.seq_length_tensor = torch.empty(self.max_num_batched_tokens, dtype=torch.long, device="cuda") - self.b_seq_start_tensor = torch.empty(self.max_num_batched_tokens, + self.b_seq_start_tensor = torch.zeros(self.max_num_batched_tokens, dtype=torch.long, device="cuda") self.lora_index_tensor = torch.empty(self.max_num_batched_tokens, @@ -561,7 +561,35 @@ def _set_lora_mapping(self, mapping: LoRAMapping) -> None: # Maintain the reference self.indices_len[:] = indices_len - + if mapping.seq_lens: + batchs = len(mapping.seq_lens) + seq_length_tensor = torch.tensor(mapping.seq_lens, + dtype=torch.long, + device="cuda") + self.seq_length_tensor[:batchs].copy_(seq_length_tensor) + # b_seq_start_tensor = torch.zeros(seq_length_tensor.shape[0] + 1, + # dtype=torch.long, + # device="cuda") + # torch.cumsum(seq_length_tensor, + # dim=0, + # dtype=seq_length_tensor.dtype, + # out=b_seq_start_tensor[1:]) + torch.cumsum(seq_length_tensor, + dim=0, + dtype=seq_length_tensor.dtype, + out=self.b_seq_start_tensor[1:]) + # self.b_seq_start_tensor[:batchs].copy_(b_seq_start_tensor) + lora_id_lst = [] + for lora_index in mapping.batch_mapping: + lora_id_lst.append( + self.lora_index_to_id.index(lora_index + ) if lora_index > 0 else -1) + lora_id_tensor = torch.tensor(lora_id_lst, + dtype=torch.long, + device="cuda") + self.lora_index_tensor[:lora_id_tensor.size(0)].copy_( + lora_id_tensor) + self.batch_mlength_lst[:] = [batchs, max(mapping.seq_lens)] def set_lora_mapping(self, lora_mapping: LoRAMapping) -> None: if self._last_mapping != lora_mapping: diff --git a/vllm/lora/ops/sgmv_expand_slice.py b/vllm/lora/ops/sgmv_expand_slice.py index a8d93aa196a2..b0bf8015431e 100644 --- a/vllm/lora/ops/sgmv_expand_slice.py +++ b/vllm/lora/ops/sgmv_expand_slice.py @@ -54,30 +54,22 @@ def _sgmv_expand_slice_kernel( ram = tl.max_contiguous(tl.multiple_of(offset_m % M, BLOCK_M), BLOCK_M) rbn = tl.max_contiguous(tl.multiple_of(offset_n % N, BLOCK_N), BLOCK_N) - a_ptr = ( - input_ptr - + cur_seq_start * xm_stride - + ram[:, None] * xm_stride - + offset_k[None, :] * xk_stride, - ) - b_ptr = ( - lora_ptr - + l0_stride * lora_index - + offset_k[:, None] * lora_n_stride - + rbn[None, :] * lora_k_stride - ) + a_ptr = (input_ptr + cur_seq_start * xm_stride + ram[:, None] * xm_stride + + offset_k[None, :] * xk_stride, ) + b_ptr = (lora_ptr + l0_stride * lora_index + + offset_k[:, None] * lora_n_stride + rbn[None, :] * lora_k_stride) accumulator = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32) for k in range(tl.cdiv(K, BLOCK_K)): if EVEN_K: tiled_a = tl.load(a_ptr) tiled_b = tl.load(b_ptr) else: - tiled_a = tl.load( - a_ptr, mask=offset_k[None, :] < K - k * BLOCK_K, other=0 - ) - tiled_b = tl.load( - b_ptr, mask=offset_k[:, None] < K - k * BLOCK_K, other=0 - ) + tiled_a = tl.load(a_ptr, + mask=offset_k[None, :] < K - k * BLOCK_K, + other=0) + tiled_b = tl.load(b_ptr, + mask=offset_k[:, None] < K - k * BLOCK_K, + other=0) if CAST_TYPE: tiled_a = tiled_a.to(lora_ptr.dtype.element_ty) # TODO Can I use D=A@B+C ? 
@@ -89,16 +81,12 @@ def _sgmv_expand_slice_kernel( b_ptr += BLOCK_K * lora_n_stride tiled_c = accumulator.to(lora_ptr.dtype.element_ty) offset_cm = cur_seq_start + tl.arange(0, BLOCK_M) + pid_m * BLOCK_M - offset_cn = tl.arange(0, BLOCK_N) + pid_n * BLOCK_N+slice_offset - c_ptr = ( - out_ptr - + offset_cm[:, None] * cm_stride - + offset_cn[None, :] * cn_stride - ) + offset_cn = tl.arange(0, BLOCK_N) + pid_n * BLOCK_N + slice_offset + c_ptr = (out_ptr + offset_cm[:, None] * cm_stride + + offset_cn[None, :] * cn_stride) M = tl.load(seq_lens + cur_batch) - c_mask = (offset_cm[:, None] < (cur_seq_start + M)) & ( - offset_cn[None, :] < (slice_offset+N) - ) + c_mask = (offset_cm[:, None] < (cur_seq_start + M)) & (offset_cn[None, :] < + (slice_offset + N)) if ADD_INPUTS: tiled_out = tl.load(c_ptr, mask=c_mask) tiled_c += tiled_out @@ -150,7 +138,7 @@ def sgmv_expand_slice( assert inputs.size(1) == lora_b_weights.size(-1) assert b_seq_start_loc.size(0) == batchs assert lora_indices_tensor.size(0) == batchs - assert slice_size==lora_b_weights.size(-2) + assert slice_size == lora_b_weights.size(-2) assert inputs.is_contiguous() assert output_tensor.is_contiguous() @@ -158,7 +146,7 @@ def sgmv_expand_slice( assert lora_b_weights.size(1) == 1 lora_b_weights = lora_b_weights.squeeze(dim=1) else: - assert lora_b_weights.ndim == 3 # shape:(lora_num,size,rank) + assert lora_b_weights.ndim == 3 # shape:(lora_num,size,rank) assert lora_b_weights.is_contiguous() @@ -172,8 +160,8 @@ def sgmv_expand_slice( ADD_INPUTS = add_inputs CAST_TYPE = False if inputs.dtype == torch.float32 and lora_b_weights.dtype in [ - torch.float16, - torch.bfloat16, + torch.float16, + torch.bfloat16, ]: CAST_TYPE = True grid = [ diff --git a/vllm/lora/ops/sgmv_shrink.py b/vllm/lora/ops/sgmv_shrink.py index 1b7cf0f3caa6..b8d0d8a23c8c 100644 --- a/vllm/lora/ops/sgmv_shrink.py +++ b/vllm/lora/ops/sgmv_shrink.py @@ -134,7 +134,7 @@ def sgmv_shrink( BLOCK_M = 32 BLOCK_N = 16 BLOCK_K = 32 - SPLIT_K = 1 + SPLIT_K = 16 EVEN_K = False grid = [ triton.cdiv(max_seq_length, BLOCK_M) * triton.cdiv(N, BLOCK_N), diff --git a/vllm/lora/punica.py b/vllm/lora/punica.py index fe7319f93b96..4f4fccca8051 100644 --- a/vllm/lora/punica.py +++ b/vllm/lora/punica.py @@ -150,7 +150,6 @@ def add_lora( punica_kernels.dispatch_bgmv(buffer, x, wa_t_all, indicies, layer_idx, 1.0) punica_kernels.dispatch_bgmv(y, buffer, wb_t_all, indicies, layer_idx, scale) - return buffer def add_lora_triton( @@ -198,7 +197,6 @@ def add_lora_triton( max_length, add_inputs=True, ) - return buffer def add_lora_slice( diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index d6713b59944e..aaa8a66c40ab 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -587,10 +587,9 @@ def _prepare_model_input( ) if self.lora_config: - lora_mapping = LoRAMapping( - lora_index_mapping, lora_prompt_mapping, - batch_lora_index_mapping, query_lens, - bool(attn_metadata.prefill_metadata)) + lora_mapping = LoRAMapping(lora_index_mapping, lora_prompt_mapping, + batch_lora_index_mapping, query_lens, + bool(attn_metadata.prefill_metadata)) else: lora_mapping = None @@ -906,13 +905,10 @@ def capture_model(self, kv_caches: List[torch.Tensor]) -> None: ) if self.lora_config: - lora_mapping = LoRAMapping( - [0] * batch_size, - [0] * batch_size, - [0] * batch_size, - [1] * batch_size, - False - ) + lora_mapping = LoRAMapping([0] * batch_size, + [0] * batch_size, + [0] * batch_size, + [1] * batch_size, False) self.set_active_loras(set(), lora_mapping) graph_runner = 
CUDAGraphRunner(self.model) From 40d449abb6992662d237da51773a50376688c3a9 Mon Sep 17 00:00:00 2001 From: jeejeeli Date: Wed, 29 May 2024 17:23:36 +0800 Subject: [PATCH 06/71] backup --- tests/lora/test_triton_sgmv.py | 69 ++++++++- vllm/lora/layers.py | 220 +++++++++++------------------ vllm/lora/models.py | 95 +++++++------ vllm/lora/ops/sgmv_expand.py | 7 + vllm/lora/ops/sgmv_expand_slice.py | 7 + vllm/lora/ops/sgmv_shrink.py | 7 + 6 files changed, 215 insertions(+), 190 deletions(-) diff --git a/tests/lora/test_triton_sgmv.py b/tests/lora/test_triton_sgmv.py index d0903f76cd37..db3739f35d24 100644 --- a/tests/lora/test_triton_sgmv.py +++ b/tests/lora/test_triton_sgmv.py @@ -6,6 +6,7 @@ import vllm.lora.punica as punica from vllm.lora.ops.sgmv_expand import sgmv_expand from vllm.lora.ops.sgmv_shrink import sgmv_shrink +from vllm.lora.ops.sgmv_expand_slice import sgmv_expand_slice # The current punica kernel supports dimension and adds a dimension of 3424. HIDDEN_SIZES = [ @@ -327,5 +328,69 @@ def test_sgmv_punica_bgmv( assert_close(our_out_tensor, ref_out_tensor) -# if __name__ == "__main__": -# pytest.main(["test_triton_sgmv.py::test_sgmv_torch"]) +@pytest.mark.skip("TODO") +@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES) +@pytest.mark.parametrize("scaling", SCALES) +@pytest.mark.parametrize("dtype", DTYPES) +@pytest.mark.parametrize("seed", SEED) +@pytest.mark.parametrize("device", CUDA_DEVICES) +def test_sgmv_expand_nslice( + hidden_size, + scaling: float, + dtype: torch.dtype, + op_type: str, + seed: int, + device: str, +): + # avoid `No suitable kernel. h_in=xx h_out=xxxx ` error + if dtype == torch.float32 or hidden_size == 3424: + return + torch.manual_seed(seed) + batchs = 4 # Arbitrary values for testing + rank = 16 + seq_len = 333 # Arbitrary values for testing + num_loras = 8 # Arbitrary values for testing + ( + inputs_tensor, + lora_weights, + our_out_tensor, + ref_out_tensor, + b_seq_start_loc, + lora_indices_tensor, + seq_len_tensor, + indices, + ) = _generate_data(batchs, hidden_size, num_loras, rank, seq_len, dtype, + op_type, device) + + max_seq_length = seq_len_tensor.max() + if isinstance(max_seq_length, tuple): + max_seq_length = max_seq_length[0].item() + else: + max_seq_length = max_seq_length.item() + + sgmv_expand_slice( + inputs_tensor, + lora_weights, + our_out_tensor, + b_seq_start_loc, + seq_len_tensor, + lora_indices_tensor, + batchs, + max_seq_length, + 1024, + add_inputs=True, + ) + lora_weights_4d = lora_weights.unsqueeze(dim=1) + _punica_bgmv( + ref_out_tensor, + inputs_tensor, + lora_weights_4d, + indices, + scaling if op_type == "shrink" else 1.0, + ) + + assert_close(our_out_tensor, ref_out_tensor) + + +if __name__ == "__main__": + pytest.main(["test_triton_sgmv.py::test_sgmv_expand_nslice"]) diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py index 68127fd5fe61..a3a40ad0bd24 100644 --- a/vllm/lora/layers.py +++ b/vllm/lora/layers.py @@ -102,7 +102,7 @@ def _apply_lora_triton( b_seq_start_tensor: torch.Tensor, seq_length_tensor: torch.Tensor, lora_index_tensor: torch.Tensor, - batch_mlength_lst: List[int], + batch_mlen_stage_lst: List[int], output: torch.Tensor, ): # """Applies lora to each input. 
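# An eager-mode sketch of the shrink/expand pair that _apply_lora_triton
# drives through add_lora_triton (hypothetical sizes, single LoRA,
# reference math only):
import torch

tokens, hidden, rank, out_dim = 10, 256, 16, 256
x = torch.randn(tokens, hidden, dtype=torch.float16)
lora_a = torch.randn(rank, hidden, dtype=torch.float16)   # one A matrix
lora_b = torch.randn(out_dim, rank, dtype=torch.float16)  # one B matrix
output = torch.zeros(tokens, out_dim, dtype=torch.float16)

buffer = x.float() @ lora_a.float().t()                   # sgmv_shrink, scale=1.0
output += (buffer @ lora_b.float().t()).to(output.dtype)  # sgmv_expand, add_inputs=True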
@@ -124,13 +124,13 @@ def _apply_lora_triton( x = x.view(-1, x.shape[-1]) output = output.view(-1, output.shape[-1]) - batch_size = batch_mlength_lst[0] - max_length = batch_mlength_lst[1] + batch_size = batch_mlen_stage_lst[0] + max_length = batch_mlen_stage_lst[1] add_lora_triton(output, x, lora_a_stacked, lora_b_stacked, - b_seq_start_tensor, seq_length_tensor, - lora_index_tensor, batch_size, max_length, 0, 1.0) - return output.view_as(org_output) + b_seq_start_tensor, seq_length_tensor, lora_index_tensor, + batch_size, max_length, 0, 1.0) + return output.view_as(org_output) def _apply_lora_packed_nslice( @@ -181,7 +181,7 @@ def _apply_lora_triton_nslice( b_seq_start_tensor: torch.Tensor, seq_length_tensor: torch.Tensor, lora_index_tensor: torch.Tensor, - batch_mlength_lst: List[int], + batch_mlen_stage_lst: List[int], output: torch.Tensor, output_slices: Tuple[int, ...], ): @@ -204,8 +204,8 @@ def _apply_lora_triton_nslice( x = x.view(-1, x.shape[-1]) output = output.view(-1, output.shape[-1]) - batch_size = batch_mlength_lst[0] - max_length = batch_mlength_lst[1] + batch_size = batch_mlen_stage_lst[0] + max_length = batch_mlen_stage_lst[1] offset_left = 0 #TODO fuse these kernel @@ -275,24 +275,14 @@ def set_lora( ... def set_mapping( - self, - base_indices: torch.Tensor, - sampler_indices: torch.Tensor, - sampler_indices_padded: torch.Tensor, - embeddings_indices: torch.Tensor, - long_lora_indices: torch.Tensor, - indices_len: List[int], - ): + self, base_indices: torch.Tensor, sampler_indices: torch.Tensor, + sampler_indices_padded: torch.Tensor, + embeddings_indices: torch.Tensor, long_lora_indices: torch.Tensor, + indices_len: List[int], seq_length_tensor: torch.Tensor, + b_seq_start_tensor: torch.Tensor, batch_mlen_stage_lst: List[int]): """Sets the mapping indices.""" ... - def set_kernel_mapping(self, seq_length_tensor: torch.Tensor, - b_seq_start_tensor: torch.Tensor, - lora_index_tensor: torch.Tensor, - batch_mlength_lst: List[int]): - """Sets the kernel mapping""" - ... 
- @classmethod def can_replace_layer(cls, source_layer: nn.Module, lora_config: LoRAConfig, packed_modules_list: List, @@ -372,8 +362,7 @@ def create_lora_weights( self.seq_length_tensor: torch.Tensor self.b_seq_start_tensor: torch.Tensor - self.lora_index_tensor: torch.Tensor - self.batch_mlength_list: List[int] + self.batch_mlen_stage_lst: List[int] def reset_lora(self, index: int): self.lora_a_stacked[index] = 0 @@ -409,29 +398,17 @@ def set_lora( self.embeddings_weights[:embeddings.shape[0]].copy_(embeddings) def set_mapping( - self, - base_indices: torch.Tensor, - sampler_indices: torch.Tensor, - sampler_indices_padded: torch.Tensor, - embeddings_indices: torch.Tensor, - long_lora_indices: torch.Tensor, - indices_len: List[int], - ): + self, base_indices: torch.Tensor, sampler_indices: torch.Tensor, + sampler_indices_padded: torch.Tensor, + embeddings_indices: torch.Tensor, long_lora_indices: torch.Tensor, + indices_len: List[int], seq_length_tensor: torch.Tensor, + b_seq_start_tensor: torch.Tensor, batch_mlen_stage_lst: List[int]): self.indices = base_indices self.embeddings_indices = embeddings_indices self.indices_len = indices_len - - def set_kernel_mapping( - self, - seq_length_tensor: torch.Tensor, - b_seq_start_tensor: torch.Tensor, - lora_index_tensor: torch.Tensor, - batch_mlength_lst: List[int], - ): self.seq_length_tensor = seq_length_tensor self.b_seq_start_tensor = b_seq_start_tensor - self.lora_index_tensor = lora_index_tensor - self.batch_mlength_list = batch_mlength_lst + self.batch_mlen_stage_lst = batch_mlen_stage_lst def forward(self, x: torch.Tensor) -> torch.Tensor: added_tokens_mask = x > self.base_layer.org_vocab_size - 1 @@ -453,17 +430,17 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: full_lora_a_embeddings = full_lora_a_embeddings.view( full_lora_a_embeddings.shape[0] * full_lora_a_embeddings.shape[1], -1) - batchs, max_length = self.batch_mlength_list[ - 0], self.batch_mlength_list[1] + batch_size, max_length = self.batch_mlen_stage_lst[ + 0], self.batch_mlen_stage_lst[1] sgmv_expand( full_lora_a_embeddings, self.lora_b_stacked, full_output, - self.b_seq_start_tensor[:batchs], - self.seq_length_tensor[:batchs], - self.lora_index_tensor[:batchs], - batchs, + self.b_seq_start_tensor[:batch_size], + self.seq_length_tensor[:batch_size], + self.indices[:batch_size], + batch_size, max_length, True, ) @@ -524,8 +501,7 @@ def create_lora_weights( self.indices_len: List[int] self.seq_length_tensor: torch.Tensor self.b_seq_start_tensor: torch.Tensor - self.lora_index_tensor: torch.Tensor - self.batch_mlength_list: List[int] + self.batch_mlen_stage_lst: List[int] def reset_lora(self, index: int): self.lora_a_stacked[index] = 0 @@ -563,39 +539,27 @@ def set_lora( lora_b.T, non_blocking=True) def set_mapping( - self, - base_indices: torch.Tensor, - sampler_indices: torch.Tensor, - sampler_indices_padded: torch.Tensor, - embeddings_indices: torch.Tensor, - long_lora_indices: torch.Tensor, - indices_len: List[int], - ): + self, base_indices: torch.Tensor, sampler_indices: torch.Tensor, + sampler_indices_padded: torch.Tensor, + embeddings_indices: torch.Tensor, long_lora_indices: torch.Tensor, + indices_len: List[int], seq_length_tensor: torch.Tensor, + b_seq_start_tensor: torch.Tensor, batch_mlen_stage_lst: List[int]): self.indices = base_indices self.indices_len = indices_len - - def set_kernel_mapping( - self, - seq_length_tensor: torch.Tensor, - b_seq_start_tensor: torch.Tensor, - lora_index_tensor: torch.Tensor, - batch_mlength_lst: List[int], - ): 
self.seq_length_tensor = seq_length_tensor self.b_seq_start_tensor = b_seq_start_tensor - self.lora_index_tensor = lora_index_tensor - self.batch_mlength_list = batch_mlength_lst + self.batch_mlen_stage_lst = batch_mlen_stage_lst def apply(self, x: torch.Tensor, bias: Optional[torch.Tensor]) -> torch.Tensor: output = self.base_layer.quant_method.apply(self.base_layer, x, bias) - batch_size = self.batch_mlength_list[0] + batch_size = self.batch_mlen_stage_lst[0] # maybe we need not restrict range to [:batch_size] _apply_lora_triton(x, self.lora_a_stacked, self.lora_b_stacked, self.b_seq_start_tensor[:batch_size], self.seq_length_tensor[:batch_size], - self.lora_index_tensor[:batch_size], - self.batch_mlength_list, output) + self.indices[:batch_size], self.batch_mlen_stage_lst, + output) return output def forward(self, input_): @@ -686,11 +650,12 @@ def create_lora_weights( self.output_dim = self.lora_b_stacked[0].shape[2] # Lazily initialized. self.indices: torch.Tensor + self.indices_len: torch.Tensor self.seq_length_tensor: torch.Tensor self.b_seq_start_tensor: torch.Tensor self.lora_index_tensor: torch.Tensor - self.batch_mlength_list: List[int] + self.batch_mlen_stage_lst: List[int] def reset_lora(self, index: int): self.lora_a_stacked[0][index] = 0 @@ -756,15 +721,15 @@ def apply(self, x: torch.Tensor, # output, # (self.output_dim, self.output_dim), # ) - batchs = self.batch_mlength_list[0] + batch_size = self.batch_mlen_stage_lst[0] _apply_lora_triton_nslice( x, self.lora_a_stacked, self.lora_b_stacked, - self.b_seq_start_tensor[:batchs], - self.seq_length_tensor[:batchs], - self.lora_index_tensor[:batchs], - self.batch_mlength_list, + self.b_seq_start_tensor[:batch_size], + self.seq_length_tensor[:batch_size], + self.indices[:batch_size], + self.batch_mlen_stage_lst, output, (self.output_dim, self.output_dim), ) @@ -940,12 +905,12 @@ def create_lora_weights( self.packed_indices: Optional[torch.Tensor] = None self.standard_indices: Optional[torch.Tensor] = None # lazily initialized. 
+ self.indices: torch.Tensor self.indices_len: List[int] self.seq_length_tensor: torch.Tensor self.b_seq_start_tensor: torch.Tensor - self.lora_index_tensor: torch.Tensor - self.batch_mlength_list: List[int] + self.batch_mlen_stage_lst: List[int] def reset_lora(self, index: int): self.lora_a_stacked[0][index] = 0 @@ -1032,15 +997,15 @@ def apply(self, x: torch.Tensor, # output, # self.output_slices, # ) - batchs = self.batch_mlength_list[0] + batch_size = self.batch_mlen_stage_lst[0] _apply_lora_triton_nslice( x, self.lora_a_stacked, self.lora_b_stacked, - self.b_seq_start_tensor[:batchs], - self.seq_length_tensor[:batchs], - self.lora_index_tensor[:batchs], - self.batch_mlength_list, + self.b_seq_start_tensor[:batch_size], + self.seq_length_tensor[:batch_size], + self.indices[:batch_size], + self.batch_mlen_stage_lst, output, self.output_slices, ) @@ -1104,7 +1069,7 @@ def create_lora_weights( self.seq_length_tensor: torch.Tensor self.b_seq_start_tensor: torch.Tensor self.lora_index_tensor: torch.Tensor - self.batch_mlength_list: List[int] + self.batch_mlen_stage_lst: List[int] def reset_lora(self, index: int): self.lora_a_stacked[index] = 0 @@ -1142,38 +1107,26 @@ def set_lora( lora_b.T, non_blocking=True) def set_mapping( - self, - base_indices: torch.Tensor, - sampler_indices: torch.Tensor, - sampler_indices_padded: torch.Tensor, - embeddings_indices: torch.Tensor, - long_lora_indices: torch.Tensor, - indices_len: List[int], - ): + self, base_indices: torch.Tensor, sampler_indices: torch.Tensor, + sampler_indices_padded: torch.Tensor, + embeddings_indices: torch.Tensor, long_lora_indices: torch.Tensor, + indices_len: List[int], seq_length_tensor: torch.Tensor, + b_seq_start_tensor: torch.Tensor, batch_mlen_stage_lst: List[int]): self.indices = base_indices self.indices_len = indices_len - - def set_kernel_mapping( - self, - seq_length_tensor: torch.Tensor, - b_seq_start_tensor: torch.Tensor, - lora_index_tensor: torch.Tensor, - batch_mlength_lst: List[int], - ): self.seq_length_tensor = seq_length_tensor self.b_seq_start_tensor = b_seq_start_tensor - self.lora_index_tensor = lora_index_tensor - self.batch_mlength_list = batch_mlength_lst + self.batch_mlen_stage_lst = batch_mlen_stage_lst def apply(self, x: torch.Tensor) -> torch.Tensor: output = self.base_layer.quant_method.apply(self.base_layer, x) - batch_size = self.batch_mlength_list[0] + batch_size = self.batch_mlen_stage_lst[0] # maybe we need not restrict range to [:batch_size] _apply_lora_triton(x, self.lora_a_stacked, self.lora_b_stacked, self.b_seq_start_tensor[:batch_size], self.seq_length_tensor[:batch_size], - self.lora_index_tensor[:batch_size], - self.batch_mlength_list, output) + self.indices[:batch_size], self.batch_mlen_stage_lst, + output) return output # def apply(self, x: torch.Tensor) -> torch.Tensor: @@ -1187,13 +1140,13 @@ def apply(self, x: torch.Tensor) -> torch.Tensor: # self.indices[:self.indices_len[0]], # output, # ) - # batch_size = self.batch_mlength_list[0] - # mid2_buffer,_=_apply_lora_triton(x, self.lora_a_stacked, + # batch_size = self.batch_mlen_stage_lst[0] + # mid2_buffer,_=_apply_lora_triton(x, self.lora_a_stacked, # self.lora_b_stacked, # self.b_seq_start_tensor[:batch_size], # self.seq_length_tensor[:batch_size], - # self.lora_index_tensor[:batch_size], - # self.batch_mlength_list, output) + # self.indices[:batch_size], + # self.batch_mlen_stage_lst, output) # flag = torch.allclose(mid_buffer, mid2_buffer, 3e-2, 2e-2) # # if not flag: # # print("error") @@ -1334,7 +1287,7 @@ def 
create_lora_weights( self.seq_length_tensor: torch.Tensor self.b_seq_start_tensor: torch.Tensor self.lora_index_tensor: torch.Tensor - self.batch_mlength_list: List[int] + self.batch_mlen_stage_lst: List[int] def reset_lora(self, index: int): self.lora_a_stacked[index] = 0 @@ -1361,29 +1314,17 @@ def set_lora( shape[1], ] = embeddings_tensor def set_mapping( - self, - base_indices: torch.Tensor, - sampler_indices: torch.Tensor, - sampler_indices_padded: torch.Tensor, - embeddings_indices: torch.Tensor, - long_lora_indices: torch.Tensor, - indices_len: List[int], - ): + self, base_indices: torch.Tensor, sampler_indices: torch.Tensor, + sampler_indices_padded: torch.Tensor, + embeddings_indices: torch.Tensor, long_lora_indices: torch.Tensor, + indices_len: List[int], seq_length_tensor: torch.Tensor, + b_seq_start_tensor: torch.Tensor, batch_mlen_stage_lst: List[int]): self.indices = sampler_indices self.indices_padded = sampler_indices_padded self.indices_len = indices_len - - def set_kernel_mapping( - self, - seq_length_tensor: torch.Tensor, - b_seq_start_tensor: torch.Tensor, - lora_index_tensor: torch.Tensor, - batch_mlength_lst: List[int], - ): self.seq_length_tensor = seq_length_tensor self.b_seq_start_tensor = b_seq_start_tensor - self.lora_index_tensor = lora_index_tensor - self.batch_mlength_list = batch_mlength_lst + self.batch_mlen_stage_lst = batch_mlen_stage_lst def _get_logits( self, @@ -1431,12 +1372,12 @@ def _get_logits( logits, ) - # batch_size=self.batch_mlength_list[0] + # batch_size=self.batch_mlen_stage_lst[0] # _apply_lora_triton(hidden_states, self.lora_a_stacked, self.lora_b_stacked, # self.b_seq_start_tensor[:batch_size], # self.seq_length_tensor[:batch_size], # self.indices[:self.indices_len[1]], - # self.batch_mlength_list, logits_temp) + # self.batch_mlen_stage_lst, logits_temp) # flag=torch.allclose(logits_temp,logits,rtol=1e-2,atol=1e-2) # if flag: # print("pass") @@ -1517,14 +1458,11 @@ def set_lora( ... def set_mapping( - self, - base_indices: torch.Tensor, - sampler_indices: torch.Tensor, - sampler_indices_padded: torch.Tensor, - embeddings_indices: torch.Tensor, - long_lora_indices: torch.Tensor, - indices_len: List[int], - ): + self, base_indices: torch.Tensor, sampler_indices: torch.Tensor, + sampler_indices_padded: torch.Tensor, + embeddings_indices: torch.Tensor, long_lora_indices: torch.Tensor, + indices_len: List[int], seq_length_tensor: torch.Tensor, + b_seq_start_tensor: torch.Tensor, batch_mlen_stage_lst: List[int]): self.long_lora_indices = long_lora_indices self.indices_len = indices_len diff --git a/vllm/lora/models.py b/vllm/lora/models.py index 96e2e51bd93e..1cdc3a03b8bf 100644 --- a/vllm/lora/models.py +++ b/vllm/lora/models.py @@ -20,6 +20,12 @@ parse_fine_tuned_lora_name, replace_submodule) from vllm.utils import LRUCache, is_pin_memory_available +# NOTE: The number of _MAX_BATCHS derived from worker's model_runner. +# _BATCH_SIZES_TO_CAPTURE.It needs to be updated if _BATCH_SIZES_TO_CAPTURE +# is changed. 
+ +_MAX_BATCHS = 256+16 #max(_BATCH_SIZES_TO_CAPTURE)+16 + logger = init_logger(__name__) _GLOBAL_LORA_ID = 0 @@ -83,7 +89,7 @@ def convert_mapping( """ index_mapping_indices: List[int] = list(mapping.index_mapping).copy() embedding_indices = index_mapping_indices.copy() - lora_indices = index_mapping_indices.copy() + lora_indices = mapping.batch_mapping.copy() long_lora_offsets: Optional[torch.Tensor] = None if long_lora_context: long_lora_offsets = torch.zeros(len(index_mapping_indices), @@ -93,22 +99,27 @@ def convert_mapping( lora_index_to_id.index(x) if x > 0 else -1 for x in mapping.prompt_mapping ] - lora_idx = None + token_lora_idx = None for i in range(len(index_mapping_indices)): # TODO index can be slow. optimize - lora_idx = (lora_index_to_id.index(index_mapping_indices[i]) - if index_mapping_indices[i] > 0 else -1) - embedding_indices[i] = lora_idx if index_mapping_indices[i] > 0 else 0 - lora_indices[i] = lora_idx + token_lora_idx = (lora_index_to_id.index(index_mapping_indices[i]) + if index_mapping_indices[i] > 0 else -1) + embedding_indices[ + i] = token_lora_idx if index_mapping_indices[i] > 0 else 0 if long_lora_context: assert long_lora_offsets is not None lora_offset: int = long_lora_context.offsets_by_lora_id.get( index_mapping_indices[i], 0) long_lora_offsets[i] = lora_offset + # every seq lora_id + for i in range(len(lora_indices)): + lora_indices[i] = (lora_index_to_id.index(lora_indices[i]) + if lora_indices[i] > 0 else -1) indices_list: List[Union[List[int], torch.Tensor]] = [ - index_mapping_indices, lora_indices, embedding_indices + index_mapping_indices, embedding_indices ] + base_indices = torch.tensor(lora_indices, dtype=torch.long, device="cuda") if long_lora_context: assert long_lora_offsets is not None indices_list.append(long_lora_offsets) @@ -117,11 +128,11 @@ def convert_mapping( device="cuda", dtype=torch.long) embeddings_indices = torch.stack([ - indices[2] * extra_vocab_size, - indices[2] * (vocab_size + extra_vocab_size) + indices[1] * extra_vocab_size, + indices[1] * (vocab_size + extra_vocab_size) ]) embeddings_indices[embeddings_indices == -1] = max_loras - 1 - base_indices = indices[1] + sampler_indices = prompt_mapping_tensor sampler_indices_padded = sampler_indices.clone() sampler_indices_padded[sampler_indices_padded == -1] = max_loras - 1 @@ -132,7 +143,7 @@ def convert_mapping( long_lora_indices = None long_lora_indices_len: Optional[int] = None if long_lora_context: - long_lora_indices = indices[3] + long_lora_indices = indices[2] long_lora_indices_len = long_lora_indices.shape[-1] # Contain length of indices tensors. Used to index into each tensor. indices_len = [ @@ -400,6 +411,7 @@ def __init__( self.max_num_batched_tokens, dtype=torch.long, device="cuda") + self.long_lora_indices = torch.empty(self.max_num_batched_tokens, dtype=torch.long, device="cuda") @@ -429,17 +441,19 @@ def __init__( self._last_mapping: Optional[LoRAMapping] = None # triton kernel mapping - - self.batch_mlength_lst = [-1] * 2 - self.seq_length_tensor = torch.empty(self.max_num_batched_tokens, + self.seq_length_tensor = torch.empty(_MAX_BATCHS, dtype=torch.long, device="cuda") - self.b_seq_start_tensor = torch.zeros(self.max_num_batched_tokens, + self.b_seq_start_tensor = torch.zeros(_MAX_BATCHS, dtype=torch.long, device="cuda") - self.lora_index_tensor = torch.empty(self.max_num_batched_tokens, - dtype=torch.long, - device="cuda") + + # element contains batch_size, max_length, 0 or 1. 
Use 1 for the + # prefilling stage and 0 for the decoding stage.The reason for + # distinguishing between the prefilling and decoding stage is that + # if we have implemented bgmv, it can be utilized during the decoding + # stage. + self.batch_mlen_stage_lst = [-1] * 3 self._create_lora_modules() self.model.lora_manager = self @@ -561,35 +575,23 @@ def _set_lora_mapping(self, mapping: LoRAMapping) -> None: # Maintain the reference self.indices_len[:] = indices_len - if mapping.seq_lens: + # Mapping for sgmv kernel + if mapping.seq_lens and mapping.batch_mapping: batchs = len(mapping.seq_lens) seq_length_tensor = torch.tensor(mapping.seq_lens, dtype=torch.long, device="cuda") self.seq_length_tensor[:batchs].copy_(seq_length_tensor) - # b_seq_start_tensor = torch.zeros(seq_length_tensor.shape[0] + 1, - # dtype=torch.long, - # device="cuda") - # torch.cumsum(seq_length_tensor, - # dim=0, - # dtype=seq_length_tensor.dtype, - # out=b_seq_start_tensor[1:]) - torch.cumsum(seq_length_tensor, - dim=0, - dtype=seq_length_tensor.dtype, - out=self.b_seq_start_tensor[1:]) - # self.b_seq_start_tensor[:batchs].copy_(b_seq_start_tensor) - lora_id_lst = [] - for lora_index in mapping.batch_mapping: - lora_id_lst.append( - self.lora_index_to_id.index(lora_index - ) if lora_index > 0 else -1) - lora_id_tensor = torch.tensor(lora_id_lst, - dtype=torch.long, - device="cuda") - self.lora_index_tensor[:lora_id_tensor.size(0)].copy_( - lora_id_tensor) - self.batch_mlength_lst[:] = [batchs, max(mapping.seq_lens)] + temp_tensor=torch.cumsum( + seq_length_tensor, + dim=0, + dtype=seq_length_tensor.dtype) + self.b_seq_start_tensor[1:temp_tensor.size(0)+1].copy_(temp_tensor) + + self.batch_mlen_stage_lst[:] = [ + batchs, + max(mapping.seq_lens), 1 if mapping.is_prefilling else 0 + ] def set_lora_mapping(self, lora_mapping: LoRAMapping) -> None: if self._last_mapping != lora_mapping: @@ -642,11 +644,10 @@ def _create_lora_modules(self): new_module.set_mapping(self.base_indices, self.sampler_indices, self.sampler_indices_padded, self.embeddings_indices, - self.long_lora_indices, self.indices_len) - new_module.set_kernel_mapping(self.seq_length_tensor, - self.b_seq_start_tensor, - self.lora_index_tensor, - self.batch_mlength_lst) + self.long_lora_indices, self.indices_len, + self.seq_length_tensor, + self.b_seq_start_tensor, + self.batch_mlen_stage_lst) def register_module(self, module_name: str, module: "BaseLayerWithLoRA"): assert isinstance(module, BaseLayerWithLoRA) diff --git a/vllm/lora/ops/sgmv_expand.py b/vllm/lora/ops/sgmv_expand.py index c68c551db89e..f2af7be4ad62 100644 --- a/vllm/lora/ops/sgmv_expand.py +++ b/vllm/lora/ops/sgmv_expand.py @@ -1,3 +1,10 @@ +""" +Based on: +Chen, L., Ye, Z., Wu, Y., Zhuo, D., Ceze, L., & Krishnamurthy, A. (2023). +Punica: Multi-Tenant LoRA Serving. +https://arxiv.org/abs/2310.18547 +""" + import torch import triton import triton.language as tl diff --git a/vllm/lora/ops/sgmv_expand_slice.py b/vllm/lora/ops/sgmv_expand_slice.py index b0bf8015431e..72ed81bcbbd3 100644 --- a/vllm/lora/ops/sgmv_expand_slice.py +++ b/vllm/lora/ops/sgmv_expand_slice.py @@ -1,3 +1,10 @@ +""" +Based on: +Chen, L., Ye, Z., Wu, Y., Zhuo, D., Ceze, L., & Krishnamurthy, A. (2023). +Punica: Multi-Tenant LoRA Serving. 
+https://arxiv.org/abs/2310.18547 +""" + import torch import triton import triton.language as tl diff --git a/vllm/lora/ops/sgmv_shrink.py b/vllm/lora/ops/sgmv_shrink.py index b8d0d8a23c8c..b5b0569b54d3 100644 --- a/vllm/lora/ops/sgmv_shrink.py +++ b/vllm/lora/ops/sgmv_shrink.py @@ -1,3 +1,10 @@ + +""" +Based on: +Chen, L., Ye, Z., Wu, Y., Zhuo, D., Ceze, L., & Krishnamurthy, A. (2023). +Punica: Multi-Tenant LoRA Serving. +https://arxiv.org/abs/2310.18547 +""" import torch import triton import triton.language as tl From 2dfeb97c9d1e15ba1f5b78187f82c71d2f2ecb63 Mon Sep 17 00:00:00 2001 From: jeejeeli Date: Wed, 29 May 2024 19:19:17 +0800 Subject: [PATCH 07/71] optimize code --- vllm/lora/ops/sgmv_shrink.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/vllm/lora/ops/sgmv_shrink.py b/vllm/lora/ops/sgmv_shrink.py index b5b0569b54d3..2727efbd57b6 100644 --- a/vllm/lora/ops/sgmv_shrink.py +++ b/vllm/lora/ops/sgmv_shrink.py @@ -50,6 +50,8 @@ def _sgmv_shrink_kernel( if pid_m * BLOCK_M > M: return lora_index = tl.load(lora_indices + cur_batch) + if lora_index == -1: + return cur_seq_start = tl.load(b_seq_start_loc + cur_batch) offset_m = tl.arange(0, BLOCK_M) + pid_m * BLOCK_M offset_n = tl.arange(0, BLOCK_N) + pid_n * BLOCK_N From 5e55ab8d69219e7d9567f171b3510871b522c95e Mon Sep 17 00:00:00 2001 From: jeejeeli Date: Thu, 30 May 2024 07:42:04 +0800 Subject: [PATCH 08/71] add bgmv --- vllm/lora/ops/bgmv_expand.py | 156 ++++++++++++++++++++++++++ vllm/lora/ops/bgmv_expand_slice.py | 169 +++++++++++++++++++++++++++++ vllm/lora/ops/bgmv_shrink.py | 139 ++++++++++++++++++++++++ vllm/lora/ops/sgmv_expand_slice.py | 2 +- vllm/lora/ops/sgmv_shrink.py | 46 +++++--- 5 files changed, 496 insertions(+), 16 deletions(-) create mode 100644 vllm/lora/ops/bgmv_expand.py create mode 100644 vllm/lora/ops/bgmv_expand_slice.py create mode 100644 vllm/lora/ops/bgmv_shrink.py diff --git a/vllm/lora/ops/bgmv_expand.py b/vllm/lora/ops/bgmv_expand.py new file mode 100644 index 000000000000..19c8d511ff9c --- /dev/null +++ b/vllm/lora/ops/bgmv_expand.py @@ -0,0 +1,156 @@ +""" +Based on: +Chen, L., Ye, Z., Wu, Y., Zhuo, D., Ceze, L., & Krishnamurthy, A. (2023). +Punica: Multi-Tenant LoRA Serving. 
+https://arxiv.org/abs/2310.18547 +""" + +import triton +import triton.language as tl +import torch + + +@triton.jit +def _bgmv_expand_kernel( + input_ptr, + lora_ptr, + out_ptr, + N, + K, + lora_indices, + xm_stride, + xk_stride, + l0_stride, + lora_k_stride, + lora_n_stride, + cm_stride, + cn_stride, + BLOCK_N: tl.constexpr, + BLOCK_K: tl.constexpr, + EVEN_K: tl.constexpr, + ADD_INPUTS: tl.constexpr, + CAST_TYPE: tl.constexpr, +): + pid_n = tl.program_id(axis=0) + cur_batch = tl.program_id(axis=1) + lora_index = tl.load(lora_indices + cur_batch) + if lora_index == -1: + return + offset_n = tl.arange(0, BLOCK_N) + pid_n * BLOCK_N + offset_k = tl.arange(0, BLOCK_K) + rbn = tl.max_contiguous(tl.multiple_of(offset_n % N, BLOCK_N), BLOCK_N) + # a_ptr = input_ptr + cur_batch * xm_stride + offset_k[None, :] * xk_stride + a_ptr = input_ptr + cur_batch * xm_stride + offset_k[:,None] * xk_stride + b_ptr = ( + lora_ptr + + l0_stride * lora_index + + rbn[None, :] * lora_k_stride + + offset_k[:, None] * lora_n_stride + ) + accumulator = tl.zeros((1, BLOCK_N), dtype=lora_ptr.dtype.element_ty) + for k in range(0, tl.cdiv(K, BLOCK_K)): + if EVEN_K: + tiled_a = tl.load(a_ptr) + tiled_b = tl.load(b_ptr) + else: + k_remaining = K - k * BLOCK_K + tiled_a = tl.load( + a_ptr, mask=offset_k[None, :] < k_remaining, other=0.0 + ) + tiled_b = tl.load( + b_ptr, mask=offset_k[:, None] < k_remaining, other=0.0 + ) + if CAST_TYPE: + tiled_a = tiled_a.to(lora_ptr.dtype.element_ty) + accumulator += tl.sum(tiled_a[None, :] * tiled_b, 1) + a_ptr += BLOCK_K * xk_stride + b_ptr += BLOCK_K * lora_n_stride + + offset_cn = tl.arange(0, BLOCK_N) + pid_n * BLOCK_N + c_ptr = out_ptr + cur_batch * cm_stride + offset_cn[None, :] * cn_stride + c_mask = offset_cn[None, :] < N + if ADD_INPUTS: + tiled_out = tl.load(c_ptr, mask=c_mask) + accumulator += tiled_out + tl.store(c_ptr, accumulator, mask=c_mask) + + +@torch.inference_mode() +def bgmv_expand( + inputs: torch.Tensor, + lora_b_weights: torch.Tensor, + output_tensor: torch.Tensor, + lora_indices_tensor: torch.Tensor, + batchs: int, + add_inputs: bool = False, +): + """ + Args: + inputs (torch.Tensor): input tensor + lora_b_weights (torch.Tensor): lora'a weight + output_tensor (torch.Tensor): output tensor + lora_indices_tensor (torch.Tensor): (batch_size,). The LoRA index + corresponding to each batch + batchs (int): batch size + add_inputs (bool, optional): _description_. Defaults to False. + cast_type (bool, optional): _description_. Defaults to False. 
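+
+        Unlike the sgmv kernels there is exactly one input row per batch
+        element, so no start/length tensors are needed. Reference
+        semantics (sketch, assuming 3-D lora_b_weights):
+
+            for i in range(batchs):
+                idx = lora_indices_tensor[i]
+                if idx == -1:
+                    continue
+                delta = inputs[i] @ lora_b_weights[idx].T
+                output_tensor[i] = (
+                    output_tensor[i] + delta if add_inputs else delta)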
+ """ + + assert inputs.dtype in [torch.float16, torch.bfloat16, torch.float32] + assert lora_b_weights.dtype in [ + torch.float16, + torch.bfloat16, + torch.float32, + ] + assert inputs.size(1) == lora_b_weights.size(-1) + + assert lora_indices_tensor.size(0) == batchs + assert inputs.is_contiguous() + assert output_tensor.is_contiguous() + + if lora_b_weights.ndim == 4: # shape:(lora_num,1,size,rank) + assert lora_b_weights.size(1) == 1 + lora_b_weights = lora_b_weights.squeeze(dim=1) + else: + assert lora_b_weights.ndim == 3 # shape:(lora_num,size,rank) + + assert lora_b_weights.is_contiguous() + + # TODO tuning this config + + N, K = lora_b_weights.shape[-2:] # K= rank,N=hidden_size + BLOCK_N = 32 + BLOCK_K = 16 + EVEN_K = K % BLOCK_K == 0 + ADD_INPUTS = add_inputs + CAST_TYPE = False + if inputs.dtype == torch.float32 and lora_b_weights.dtype in [ + torch.float16, + torch.bfloat16, + ]: + CAST_TYPE = True + grid = [ + triton.cdiv(N, BLOCK_N), + batchs, + ] + _bgmv_expand_kernel[grid]( + inputs, + lora_b_weights, + output_tensor, + N, + K, + lora_indices_tensor, + inputs.stride(0), + inputs.stride(1), + lora_b_weights.stride(0), + lora_b_weights.stride(1), + lora_b_weights.stride(2), + output_tensor.stride(0), + output_tensor.stride(1), + BLOCK_N, + BLOCK_K, + EVEN_K, + ADD_INPUTS, + CAST_TYPE, + ) + return diff --git a/vllm/lora/ops/bgmv_expand_slice.py b/vllm/lora/ops/bgmv_expand_slice.py new file mode 100644 index 000000000000..0404f2383d10 --- /dev/null +++ b/vllm/lora/ops/bgmv_expand_slice.py @@ -0,0 +1,169 @@ +""" +Based on: +Chen, L., Ye, Z., Wu, Y., Zhuo, D., Ceze, L., & Krishnamurthy, A. (2023). +Punica: Multi-Tenant LoRA Serving. +https://arxiv.org/abs/2310.18547 +""" + +import triton +import triton.language as tl +import torch + +@triton.jit +def _bgmv_expand_slice_kernel( + input_ptr, + lora_ptr, + out_ptr, + N, + K, + lora_indices, + xm_stride, + xk_stride, + l0_stride, + lora_k_stride, + lora_n_stride, + cm_stride, + cn_stride, + slice_offset, + BLOCK_N: tl.constexpr, + BLOCK_K: tl.constexpr, + EVEN_K: tl.constexpr, + ADD_INPUTS: tl.constexpr, + CAST_TYPE: tl.constexpr, +): + pid_n = tl.program_id(axis=0) + pid_sk = tl.program_id(axis=1) + cur_batch = tl.program_id(axis=2) + lora_index = tl.load(lora_indices + cur_batch) + if lora_index == -1: + return + offset_n = tl.arange(0, BLOCK_N) + pid_n * BLOCK_N + offset_k = tl.arange(0, BLOCK_K) + offset_k = pid_sk * BLOCK_K + tl.arange(0, BLOCK_K) + rbn = tl.max_contiguous(tl.multiple_of(offset_n % N, BLOCK_N), BLOCK_N) + a_ptr = input_ptr + cur_batch * xm_stride + offset_k[None, :] * xk_stride + b_ptr = ( + lora_ptr + + l0_stride * lora_index + + rbn[None, :] * lora_k_stride + + offset_k[:, None] * lora_n_stride + ) + accumulator = tl.zeros((1, BLOCK_N), dtype=lora_ptr.dtype.element_ty) + for k in range(0, tl.cdiv(K, BLOCK_K)): + if EVEN_K: + tiled_a = tl.load(a_ptr) + tiled_b = tl.load(b_ptr) + else: + k_remaining = K - k * BLOCK_K + tiled_a = tl.load(a_ptr, mask=offset_k[None, :] < k_remaining, other=0.0) + tiled_b = tl.load(b_ptr, mask=offset_k[:, None] < k_remaining, other=0.0) + if CAST_TYPE: + tiled_a = tiled_a.to(lora_ptr.dtype.element_ty) + accumulator += tl.sum(tiled_a[None, :] * tiled_b, 1) + a_ptr += BLOCK_K * xk_stride + b_ptr += BLOCK_K * lora_n_stride + + offset_cn = tl.arange(0, BLOCK_N) + pid_n * BLOCK_N+slice_offset + c_ptr = out_ptr + cur_batch * cm_stride + offset_cn[None, :] * cn_stride + c_mask = offset_cn[None, :] < (slice_offset+N) + if ADD_INPUTS: + tiled_out = tl.load(c_ptr, mask=c_mask) + accumulator 
+= tiled_out + tl.store(c_ptr, accumulator, mask=c_mask) + + +@torch.inference_mode() +def bgmv_expand_slice( + inputs: torch.Tensor, + lora_b_weights: torch.Tensor, + output_tensor: torch.Tensor, + lora_indices_tensor: torch.Tensor, + batchs: int, + max_seq_length: int, + slice_offset: int, + slice_size: int, + add_inputs: bool = False, +): + """_summary_ + + Args: + inputs (torch.Tensor): input tensor + lora_b_weights (torch.Tensor): lora'a weight + output_tensor (torch.Tensor): output tensor + b_seq_start_loc (torch.Tensor): (batch_size,). The cumulative + sequence lengths of the sequences in the batch, used to index + into sequence. E.g.,if the sequence length is [4, 6], it is + [0, 4, 10]. + seq_len_tensor (torch.Tensor): (batch_size,). record the sequence + length of the sequences in the batch + lora_indices_tensor (torch.Tensor): (batch_size,). The LoRA index + corresponding to each batch + batchs (int): batch size + max_seq_length (int): The max sequence lengths of the sequences + in the batch + slice_offst (int): output_tensor's offst + slice_size (int): current output_tensor's size + add_inputs (bool, optional): _description_. Defaults to False. + """ + + assert inputs.dtype in [torch.float16, torch.bfloat16, torch.float32] + assert lora_b_weights.dtype in [ + torch.float16, + torch.bfloat16, + torch.float32, + ] + assert inputs.size(1) == lora_b_weights.size(-1) + assert lora_indices_tensor.size(0) == batchs + assert slice_size == lora_b_weights.size(-2) + assert inputs.is_contiguous() + assert output_tensor.is_contiguous() + + if lora_b_weights.ndim == 4: # shape:(lora_num,1,size,rank) + assert lora_b_weights.size(1) == 1 + lora_b_weights = lora_b_weights.squeeze(dim=1) + else: + assert lora_b_weights.ndim == 3 # shape:(lora_num,size,rank) + + assert lora_b_weights.is_contiguous() + + # TODO tuning this config + N, K = lora_b_weights.shape[-2:] # K= rank,N=hidden_size + + BLOCK_M = 32 + BLOCK_N = 32 + BLOCK_K = 16 + EVEN_K = K % BLOCK_K == 0 + ADD_INPUTS = add_inputs + CAST_TYPE = False + if inputs.dtype == torch.float32 and lora_b_weights.dtype in [ + torch.float16, + torch.bfloat16, + ]: + CAST_TYPE = True + grid = [ + triton.cdiv(max_seq_length, BLOCK_M) * triton.cdiv(N, BLOCK_N), + batchs, + ] + _bgmv_expand_slice_kernel[grid]( + inputs, + lora_b_weights, + output_tensor, + N, + K, + lora_indices_tensor, + inputs.stride(0), + inputs.stride(1), + lora_b_weights.stride(0), + lora_b_weights.stride(1), + lora_b_weights.stride(2), + output_tensor.stride(0), + output_tensor.stride(1), + slice_offset, + BLOCK_M, + BLOCK_N, + BLOCK_K, + EVEN_K, + ADD_INPUTS, + CAST_TYPE, + ) + return diff --git a/vllm/lora/ops/bgmv_shrink.py b/vllm/lora/ops/bgmv_shrink.py new file mode 100644 index 000000000000..eeeff502eb5b --- /dev/null +++ b/vllm/lora/ops/bgmv_shrink.py @@ -0,0 +1,139 @@ +""" +Based on: +Chen, L., Ye, Z., Wu, Y., Zhuo, D., Ceze, L., & Krishnamurthy, A. (2023). +Punica: Multi-Tenant LoRA Serving. 
+https://arxiv.org/abs/2310.18547 +""" + +import triton +import triton.language as tl +import torch + +@triton.jit +def _bgmv_shrink_kernel( + input_ptr, + lora_ptr, + out_ptr, + N, + K, + lora_indices, + scaling, + xm_stride, + xk_stride, + l0_stride, + lora_k_stride, + lora_n_stride, + cm_stride, + cn_stride, + BLOCK_N: tl.constexpr, + BLOCK_K: tl.constexpr, + EVEN_K: tl.constexpr, + SPLIT_K: tl.constexpr, +): + pid_n = tl.program_id(axis=0) + pid_sk = tl.program_id(axis=1) + cur_batch = tl.program_id(axis=2) + lora_index = tl.load(lora_indices + cur_batch) + if lora_index == -1: + return + offset_n = tl.arange(0, BLOCK_N) + pid_n * BLOCK_N + offset_k = pid_sk * BLOCK_K + tl.arange(0, BLOCK_K) + rbn = tl.max_contiguous(tl.multiple_of(offset_n % N, BLOCK_N), BLOCK_N) + a_ptr = input_ptr + cur_batch * xm_stride + offset_k[:,None] * xk_stride + b_ptr = ( + lora_ptr + + l0_stride * lora_index + + rbn[None, :] * lora_k_stride + + offset_k[:, None] * lora_n_stride + ) + accumulator = tl.zeros((1,BLOCK_N), dtype=tl.float32) + for k in range(0, tl.cdiv(K, BLOCK_K * SPLIT_K)): + if EVEN_K: + tiled_a = tl.load(a_ptr) + tiled_b = tl.load(b_ptr) + else: + k_remaining = K - k * (BLOCK_K * SPLIT_K) + tiled_a = tl.load( + a_ptr, mask=offset_k[None, :] < k_remaining, other=0.0 + ) + tiled_b = tl.load( + b_ptr, mask=offset_k[:, None] < k_remaining, other=0.0 + ) + accumulator += tl.sum(tiled_a[None,:] * tiled_b, 1) + a_ptr += BLOCK_K * SPLIT_K * xk_stride + b_ptr += BLOCK_K * SPLIT_K * lora_n_stride + accumulator *= scaling + offset_cn = tl.arange(0, BLOCK_N) + pid_n * BLOCK_N + c_ptr = out_ptr + cur_batch * cm_stride + offset_cn[None, :] * cn_stride + c_mask = offset_cn[None, :] < N + if SPLIT_K: + tl.store(c_ptr, accumulator, mask=c_mask) + else: + tl.atomic_add(c_ptr, accumulator, mask=c_mask) + + +@torch.inference_mode() +def bgmv_shrink( + inputs: torch.Tensor, + lora_a_weights: torch.Tensor, + output_tensor: torch.Tensor, + lora_indices_tensor: torch.Tensor, + batchs: int, + scaling: float, +): + """ + + Args: + inputs (torch.Tensor): input tensor + lora_a_weights (torch.Tensor): lora'a weight + output_tensor (torch.Tensor): output tensor + lora_indices_tensor (torch.Tensor): (batch_size,). The LoRA index + corresponding to each batch + batchs (int): batch size + scaling (float): Scaling factor. 
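+
+        As a rough sketch of the intended math (an illustrative note, not
+        part of the kernel contract), for each token (batch row) i:
+        output_tensor[i] = scaling * (
+            inputs[i] @ lora_a_weights[lora_indices_tensor[i]].T)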
+ """ + assert inputs.dtype == lora_a_weights.dtype + assert inputs.dtype in [torch.float16, torch.bfloat16, torch.float32] + assert inputs.size(1) == lora_a_weights.size(-1) + assert lora_indices_tensor.size(0) == batchs + assert inputs.is_contiguous() + + if lora_a_weights.ndim == 4: # shape:(lora_num,1,rank, size) + assert lora_a_weights.size(1) == 1 + lora_a_weights = lora_a_weights.squeeze(dim=1) + else: + assert lora_a_weights.ndim == 3 # shape:(lora_num,rank, size) + assert lora_a_weights.is_contiguous() + assert output_tensor.is_contiguous() + # TODO tuning this config + N, K = lora_a_weights.shape[-2:] # K=hidden_size,N=rank + BLOCK_N = 16 + BLOCK_K = 32 + SPLIT_K = 1 + EVEN_K = K % (BLOCK_K * SPLIT_K) == 0 + grid = [ + triton.cdiv(N, BLOCK_N), + SPLIT_K, + batchs, + ] + _bgmv_shrink_kernel[grid]( + inputs, + lora_a_weights, + output_tensor, + N, + K, + lora_indices_tensor, + scaling, + inputs.stride(0), + inputs.stride(1), + lora_a_weights.stride(0), + lora_a_weights.stride(1), + lora_a_weights.stride(2), + output_tensor.stride(0), + output_tensor.stride(1), + BLOCK_N, + BLOCK_K, + EVEN_K, + SPLIT_K, + ) + return diff --git a/vllm/lora/ops/sgmv_expand_slice.py b/vllm/lora/ops/sgmv_expand_slice.py index 72ed81bcbbd3..41e65d2a15d4 100644 --- a/vllm/lora/ops/sgmv_expand_slice.py +++ b/vllm/lora/ops/sgmv_expand_slice.py @@ -60,7 +60,7 @@ def _sgmv_expand_slice_kernel( offset_k = tl.arange(0, BLOCK_K) ram = tl.max_contiguous(tl.multiple_of(offset_m % M, BLOCK_M), BLOCK_M) rbn = tl.max_contiguous(tl.multiple_of(offset_n % N, BLOCK_N), BLOCK_N) - + a_ptr = (input_ptr + cur_seq_start * xm_stride + ram[:, None] * xm_stride + offset_k[None, :] * xk_stride, ) b_ptr = (lora_ptr + l0_stride * lora_index + diff --git a/vllm/lora/ops/sgmv_shrink.py b/vllm/lora/ops/sgmv_shrink.py index 2727efbd57b6..6a94aedde9d5 100644 --- a/vllm/lora/ops/sgmv_shrink.py +++ b/vllm/lora/ops/sgmv_shrink.py @@ -1,10 +1,10 @@ - """ Based on: Chen, L., Ye, Z., Wu, Y., Zhuo, D., Ceze, L., & Krishnamurthy, A. (2023). Punica: Multi-Tenant LoRA Serving. 
https://arxiv.org/abs/2310.18547 """ + import torch import triton import triton.language as tl @@ -60,31 +60,47 @@ def _sgmv_shrink_kernel( ram = tl.max_contiguous(tl.multiple_of(offset_m % M, BLOCK_M), BLOCK_M) rbn = tl.max_contiguous(tl.multiple_of(offset_n % N, BLOCK_N), BLOCK_N) - a_ptr = (input_ptr + cur_seq_start * xm_stride + ram[:, None] * xm_stride + - offset_k[None, :] * xk_stride) - b_ptr = (lora_ptr + l0_stride * lora_index + rbn[None, :] * lora_k_stride + - offset_k[:, None] * lora_n_stride) + a_ptr = ( + input_ptr + + cur_seq_start * xm_stride + + ram[:, None] * xm_stride + + offset_k[None, :] * xk_stride + ) + b_ptr = ( + lora_ptr + + l0_stride * lora_index + + rbn[None, :] * lora_k_stride + + offset_k[:, None] * lora_n_stride + ) accumulator = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32) for k in range(0, tl.cdiv(K, BLOCK_K * SPLIT_K)): if EVEN_K: - a = tl.load(a_ptr) - b = tl.load(b_ptr) + tiled_a = tl.load(a_ptr) + tiled_b = tl.load(b_ptr) else: k_remaining = K - k * (BLOCK_K * SPLIT_K) - a = tl.load(a_ptr, mask=offset_k[None, :] < k_remaining, other=0.0) - b = tl.load(b_ptr, mask=offset_k[:, None] < k_remaining, other=0.0) - accumulator += tl.dot(a, b) + tiled_a = tl.load( + a_ptr, mask=offset_k[None, :] < k_remaining, other=0.0 + ) + tiled_b = tl.load( + b_ptr, mask=offset_k[:, None] < k_remaining, other=0.0 + ) + accumulator += tl.dot(tiled_a, tiled_b) a_ptr += BLOCK_K * SPLIT_K * xk_stride b_ptr += BLOCK_K * SPLIT_K * lora_n_stride offset_cm = cur_seq_start + tl.arange(0, BLOCK_M) + pid_m * BLOCK_M offset_cn = tl.arange(0, BLOCK_N) + pid_n * BLOCK_N - c_ptr = (out_ptr + offset_cm[:, None] * cm_stride + - offset_cn[None, :] * cn_stride) - c_mask = (offset_cm[:, None] < - (cur_seq_start + M)) & (offset_cn[None, :] < N) + c_ptr = ( + out_ptr + + offset_cm[:, None] * cm_stride + + offset_cn[None, :] * cn_stride + ) + c_mask = (offset_cm[:, None] < (cur_seq_start + M)) & ( + offset_cn[None, :] < N + ) accumulator *= scaling # handles write-back with reduction-splitting if SPLIT_K == 1: @@ -159,7 +175,7 @@ def sgmv_shrink( b_seq_start_loc, seq_len_tensor, lora_indices_tensor, - scaling, + scaling, inputs.stride(0), inputs.stride(1), lora_a_weights.stride(0), From 79c07ab225deb441d3dd45aee10eaa5d42977470 Mon Sep 17 00:00:00 2001 From: jeejeeli Date: Fri, 31 May 2024 19:29:53 +0800 Subject: [PATCH 09/71] modify bgmv --- tests/lora/test_triton_punica.py | 492 +++++++++++++++++++++++++++++ vllm/lora/ops/bgmv_expand.py | 84 +++-- vllm/lora/ops/bgmv_expand_slice.py | 108 +++---- vllm/lora/ops/bgmv_shrink.py | 84 +++-- vllm/lora/ops/sgmv_shrink.py | 45 ++- 5 files changed, 655 insertions(+), 158 deletions(-) create mode 100644 tests/lora/test_triton_punica.py diff --git a/tests/lora/test_triton_punica.py b/tests/lora/test_triton_punica.py new file mode 100644 index 000000000000..74bab70f1aad --- /dev/null +++ b/tests/lora/test_triton_punica.py @@ -0,0 +1,492 @@ +import random + +import pytest +import torch + +import vllm.lora.punica as punica +from vllm.lora.ops.sgmv_expand import sgmv_expand +from vllm.lora.ops.sgmv_shrink import sgmv_shrink +from vllm.lora.ops.bgmv_expand import bgmv_expand +from vllm.lora.ops.bgmv_shrink import bgmv_shrink +from vllm.lora.ops.sgmv_expand_slice import sgmv_expand_slice + +# The current punica kernel supports dimension and adds a dimension of 3424. 
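+# (3424 is deliberately a size the existing punica CUDA kernel cannot handle;
+# the comparison tests below skip hidden_size == 3424 so that punica.bgmv does
+# not raise its "No suitable kernel" error.)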
+HIDDEN_SIZES = [ + 128, + 256, + 512, + 1024, + 1152, + 1280, + 1536, + 2048, + 2304, + 2560, + 2752, + 3072, + 3424, + 3456, + 3584, + 4096, + 4608, + 5120, + 5504, + 5632, + 6144, + 6848, + 6912, + 7168, + 8192, + 9216, + 10240, + 11008, + 13824, + 14336, + 15360, + 22016, + 24576, + 27392, + 27648, + 32000, + 32256, + 32512, + 32768, + 33024, + 36864, + 43264, + 49152, + 64000, + 64256, + 102400, + 102656, + 128000, + 128256, +] +BATCHS = [i for i in range(0, 64, 8)] +NUM_LORA = [1, 4, 8, 16, 32, 64, 128, 256] +DTYPES = [torch.half, torch.bfloat16, torch.float32] +MAX_RANKS = [1, 4, 8, 16, 32, 64, 128] +SCALES = [0.5] +OP_TYPES = ["shrink", "expand"] +SEED = [0] +CUDA_DEVICES = [f"cuda:{0}"] + + +def assert_close(a, b): + rtol, atol = { + torch.float16: (1e-2, 1e-2), + torch.bfloat16: (12e-2, 1e-2), + torch.float32: (1e-2, 1e-2), + }[a.dtype] + torch.testing.assert_close(a, b, rtol=rtol, atol=atol) + + +@torch.inference_mode() +def _punica_bgmv(out_tensor, inputs, lora_weights, indices, scaling): + layer_idx = 0 + punica.bgmv(out_tensor, inputs, lora_weights, indices, layer_idx, scaling) + return + + +def _torch_groupgemm( + out_tensor, + inputs, + lora_weights, + lora_indices_tensor, + seq_len_tensor, + batchs, + scaling, + op_type, +) -> torch.Tensor: + out_list = [] + current_offset = 0 + for lora_index, b_length in zip(range(batchs), seq_len_tensor): + input_weight = inputs[current_offset : b_length + current_offset, :] + current_offset += b_length + lora_weight = lora_weights[lora_indices_tensor[lora_index]] + result = torch.nn.functional.linear(input_weight, lora_weight) + result *= scaling + out_list.append(result) + cat_result = torch.cat(out_list, dim=0) + if op_type == "expand": + out_tensor += cat_result + else: + out_tensor.copy_(cat_result) + return + + +def _generate_data( + batchs, hidden_size, lora_nums, max_rank, max_length, dtype, op_type, device +): + if max_length == 1: + max_length += 1 + seq_len_tensor = torch.randint(1, max_length, (batchs,)).to(device) + b_seq_start_loc = torch.cumsum( + torch.tensor([0] + seq_len_tensor[:-1].tolist(), dtype=torch.long), + dim=0, + ).to(device) + total_tokens = seq_len_tensor.sum() + if op_type == "shrink": + inputs_tensor = torch.rand((total_tokens, hidden_size), dtype=dtype).to( + device + ) + lora_weights = torch.rand( + (lora_nums, max_rank, hidden_size), # col-major + dtype=dtype, + ).to(device) + # shrink op need atomic_add, so output is initinized by 0 + ref_out_tensor = torch.zeros( + (total_tokens, max_rank), dtype=dtype, device=inputs_tensor.device + ) + # NOTE shrink kernel using torch.float32 as output type + our_out_tensor = torch.zeros( + (total_tokens, max_rank), + dtype=torch.float32, + device=inputs_tensor.device, + ) + else: + inputs_tensor = torch.rand( + (total_tokens, max_rank), + dtype=dtype, + ).to(device) + lora_weights = torch.rand( + (lora_nums, hidden_size, max_rank), # col-major + dtype=dtype, + ).to(device) + # expand op needs to complete y+=a@lora_b, so output is + # initinized randomly + ref_out_tensor = torch.rand( + (total_tokens, hidden_size), + dtype=dtype, + device=inputs_tensor.device, + ) + # Ensure the same input. 
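+        # Cloning keeps our_out_tensor identical to ref_out_tensor before the
+        # expand op runs, since expand accumulates into its output tensor.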
+ our_out_tensor = ref_out_tensor.clone() + + lora_indices_tensor = torch.randint( + 0, lora_nums - 1 if lora_nums > 1 else 1, (batchs,) + ).to(device) + indices = torch.zeros((total_tokens), dtype=torch.long).to(device) + current_offset = 0 + for b_id in range(batchs): + lora_index = lora_indices_tensor[b_id] + indices[ + current_offset : current_offset + seq_len_tensor[b_id] + ] = lora_index.item() + current_offset += seq_len_tensor[b_id].item() + return ( + inputs_tensor, + lora_weights, + our_out_tensor, + ref_out_tensor, + b_seq_start_loc, + lora_indices_tensor, + seq_len_tensor, + indices, + ) + + +@pytest.mark.skip("work in progress") +@pytest.mark.parametrize("batchs", BATCHS) +@pytest.mark.parametrize("num_loras", NUM_LORA) +@pytest.mark.parametrize("rank", MAX_RANKS) +@pytest.mark.parametrize("scaling", SCALES) +@pytest.mark.parametrize("dtype", DTYPES) +@pytest.mark.parametrize("op_type", OP_TYPES) +@pytest.mark.parametrize("seed", SEED) +@pytest.mark.parametrize("device", CUDA_DEVICES) +def test_sgmv_torch( + batchs: int, + num_loras: int, + rank: int, + scaling: float, + dtype: torch.dtype, + op_type: str, + seed: int, + device: str, +): + torch.manual_seed(seed) + if batchs == 0: + batchs += 1 + hidden_size_index = random.randint(0, len(HIDDEN_SIZES) - 1) + hidden_size = HIDDEN_SIZES[hidden_size_index] + if hidden_size > 100000: + hidden_size = hidden_size // 4 # avoid OOM + ( + inputs_tensor, + lora_weights, + our_out_tensor, + ref_out_tensor, + b_seq_start_loc, + lora_indices_tensor, + seq_len_tensor, + indices, + ) = _generate_data( + batchs, hidden_size, num_loras, rank, 1024, dtype, op_type, device + ) # The sequence length is restricted to the range [1, 1024]. + max_seq_length = seq_len_tensor.max() + if isinstance(max_seq_length, tuple): + max_seq_length = max_seq_length[0].item() + else: + max_seq_length = max_seq_length.item() + if op_type == "shrink": + sgmv_shrink( + inputs_tensor, + lora_weights, + our_out_tensor, + b_seq_start_loc, + seq_len_tensor, + lora_indices_tensor, + batchs, + max_seq_length, + scaling, + ) + else: + sgmv_expand( + inputs_tensor, + lora_weights, + our_out_tensor, + b_seq_start_loc, + seq_len_tensor, + lora_indices_tensor, + batchs, + max_seq_length, + add_inputs=True, + ) + _torch_groupgemm( + ref_out_tensor, + inputs_tensor, + lora_weights, + lora_indices_tensor, + seq_len_tensor, + batchs, + scaling if op_type == "shrink" else 1.0, + op_type, + ) + if op_type == "shrink": + ref_out_tensor = ref_out_tensor.to(torch.float32) + assert_close(our_out_tensor, ref_out_tensor) + + +@pytest.mark.skip("work in progress") +@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES) +@pytest.mark.parametrize("scaling", SCALES) +@pytest.mark.parametrize("dtype", DTYPES) +@pytest.mark.parametrize("op_type", OP_TYPES) +@pytest.mark.parametrize("seed", SEED) +@pytest.mark.parametrize("device", CUDA_DEVICES) +def test_triton_sgmv_punica_bgmv( + hidden_size, + scaling: float, + dtype: torch.dtype, + op_type: str, + seed: int, + device: str, +): + # avoid `No suitable kernel. 
h_in=xx h_out=xxxx ` error + if dtype == torch.float32 or hidden_size == 3424: + return + torch.manual_seed(seed) + batchs = 4 # Arbitrary values for testing + rank = 16 + seq_len = 333 # Arbitrary values for testing + num_loras = 8 # Arbitrary values for testing + ( + inputs_tensor, + lora_weights, + our_out_tensor, + ref_out_tensor, + b_seq_start_loc, + lora_indices_tensor, + seq_len_tensor, + indices, + ) = _generate_data( + batchs, hidden_size, num_loras, rank, seq_len, dtype, op_type, device + ) + + max_seq_length = seq_len_tensor.max() + if isinstance(max_seq_length, tuple): + max_seq_length = max_seq_length[0].item() + else: + max_seq_length = max_seq_length.item() + if op_type == "shrink": + sgmv_shrink( + inputs_tensor, + lora_weights, + our_out_tensor, + b_seq_start_loc, + seq_len_tensor, + lora_indices_tensor, + batchs, + max_seq_length, + scaling, + ) + else: + sgmv_expand( + inputs_tensor, + lora_weights, + our_out_tensor, + b_seq_start_loc, + seq_len_tensor, + lora_indices_tensor, + batchs, + max_seq_length, + add_inputs=True, + ) + lora_weights_4d = lora_weights.unsqueeze(dim=1) + _punica_bgmv( + ref_out_tensor, + inputs_tensor, + lora_weights_4d, + indices, + scaling if op_type == "shrink" else 1.0, + ) + if op_type == "shrink": + ref_out_tensor = ref_out_tensor.to(torch.float32) + assert_close(our_out_tensor, ref_out_tensor) + + +@pytest.mark.parametrize("batchs", BATCHS) +@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES) +@pytest.mark.parametrize("scaling", SCALES) +@pytest.mark.parametrize("dtype", DTYPES) +@pytest.mark.parametrize("op_type", OP_TYPES) +@pytest.mark.parametrize("seed", SEED) +@pytest.mark.parametrize("device", CUDA_DEVICES) +def test_triton_bgmv_punica_bgmv( + batchs: int, + hidden_size: int, + scaling: float, + dtype: torch.dtype, + op_type: str, + seed: int, + device: str, +): + # avoid `No suitable kernel. h_in=xx h_out=xxxx ` error + if dtype == torch.float32 or hidden_size == 3424: + return + torch.manual_seed(seed) + if batchs == 0: + batchs += 1 + rank = 16 + seq_len = 1 # + num_loras = 8 # Arbitrary values for testing + ( + inputs_tensor, + lora_weights, + our_out_tensor, + ref_out_tensor, + b_seq_start_loc, + lora_indices_tensor, + seq_len_tensor, + indices, + ) = _generate_data( + batchs, hidden_size, num_loras, rank, seq_len, dtype, op_type, device + ) + + if op_type == "shrink": + bgmv_shrink( + inputs_tensor, + lora_weights, + our_out_tensor, + lora_indices_tensor, + batchs, + scaling, + ) + else: + bgmv_expand( + inputs_tensor, + lora_weights, + our_out_tensor, + lora_indices_tensor, + batchs, + add_inputs=True, + ) + lora_weights_4d = lora_weights.unsqueeze(dim=1) + _punica_bgmv( + ref_out_tensor, + inputs_tensor, + lora_weights_4d, + indices, + scaling if op_type == "shrink" else 1.0, + ) + if op_type == "shrink": + ref_out_tensor = ref_out_tensor.to(torch.float32) + assert_close(our_out_tensor, ref_out_tensor) + + +@pytest.mark.skip("work in progress") +@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES) +@pytest.mark.parametrize("scaling", SCALES) +@pytest.mark.parametrize("dtype", DTYPES) +@pytest.mark.parametrize("seed", SEED) +@pytest.mark.parametrize("device", CUDA_DEVICES) +def test_sgmv_expand_nslice( + hidden_size, + scaling: float, + dtype: torch.dtype, + op_type: str, + seed: int, + device: str, +): + # avoid `No suitable kernel. 
h_in=xx h_out=xxxx ` error + if dtype == torch.float32 or hidden_size == 3424: + return + torch.manual_seed(seed) + batchs = 4 # Arbitrary values for testing + rank = 16 + seq_len = 333 # Arbitrary values for testing + num_loras = 8 # Arbitrary values for testing + ( + inputs_tensor, + lora_weights, + our_out_tensor, + ref_out_tensor, + b_seq_start_loc, + lora_indices_tensor, + seq_len_tensor, + indices, + ) = _generate_data( + batchs, hidden_size, num_loras, rank, seq_len, dtype, op_type, device + ) + + max_seq_length = seq_len_tensor.max() + if isinstance(max_seq_length, tuple): + max_seq_length = max_seq_length[0].item() + else: + max_seq_length = max_seq_length.item() + + sgmv_expand_slice( + inputs_tensor, + lora_weights, + our_out_tensor, + b_seq_start_loc, + seq_len_tensor, + lora_indices_tensor, + batchs, + max_seq_length, + 1024, + add_inputs=True, + ) + lora_weights_4d = lora_weights.unsqueeze(dim=1) + _punica_bgmv( + ref_out_tensor, + inputs_tensor, + lora_weights_4d, + indices, + scaling if op_type == "shrink" else 1.0, + ) + + assert_close(our_out_tensor, ref_out_tensor) + + +if __name__ == "__main__": + test_triton_bgmv_punica_bgmv( + batchs=1, + hidden_size=128, + scaling=0.5, + dtype=torch.float16, + op_type="expand", + seed=0, + device="cuda:0", + ) diff --git a/vllm/lora/ops/bgmv_expand.py b/vllm/lora/ops/bgmv_expand.py index 19c8d511ff9c..7762276b65ce 100644 --- a/vllm/lora/ops/bgmv_expand.py +++ b/vllm/lora/ops/bgmv_expand.py @@ -27,52 +27,51 @@ def _bgmv_expand_kernel( cn_stride, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr, - EVEN_K: tl.constexpr, ADD_INPUTS: tl.constexpr, CAST_TYPE: tl.constexpr, ): - pid_n = tl.program_id(axis=0) - cur_batch = tl.program_id(axis=1) + """ + C=A@B, and B is col-major matrix + """ + cur_batch = tl.program_id(axis=0) lora_index = tl.load(lora_indices + cur_batch) if lora_index == -1: return - offset_n = tl.arange(0, BLOCK_N) + pid_n * BLOCK_N offset_k = tl.arange(0, BLOCK_K) - rbn = tl.max_contiguous(tl.multiple_of(offset_n % N, BLOCK_N), BLOCK_N) - # a_ptr = input_ptr + cur_batch * xm_stride + offset_k[None, :] * xk_stride - a_ptr = input_ptr + cur_batch * xm_stride + offset_k[:,None] * xk_stride - b_ptr = ( - lora_ptr - + l0_stride * lora_index - + rbn[None, :] * lora_k_stride - + offset_k[:, None] * lora_n_stride - ) - accumulator = tl.zeros((1, BLOCK_N), dtype=lora_ptr.dtype.element_ty) - for k in range(0, tl.cdiv(K, BLOCK_K)): - if EVEN_K: - tiled_a = tl.load(a_ptr) - tiled_b = tl.load(b_ptr) - else: - k_remaining = K - k * BLOCK_K - tiled_a = tl.load( - a_ptr, mask=offset_k[None, :] < k_remaining, other=0.0 - ) - tiled_b = tl.load( - b_ptr, mask=offset_k[:, None] < k_remaining, other=0.0 - ) - if CAST_TYPE: - tiled_a = tiled_a.to(lora_ptr.dtype.element_ty) - accumulator += tl.sum(tiled_a[None, :] * tiled_b, 1) - a_ptr += BLOCK_K * xk_stride - b_ptr += BLOCK_K * lora_n_stride + offset_n = tl.arange(0, BLOCK_N) + # tl.max_contiguous(offset_k, BLOCK_K) + tiled_a = tl.load( + input_ptr + cur_batch * xm_stride + offset_k * xk_stride, + mask=offset_k < K, + other=0, + ) # [BLOCK_K] + b_ptr = lora_ptr + l0_stride * lora_index + if CAST_TYPE: + tiled_a = tiled_a.to(lora_ptr.dtype.element_ty) + # sliding to next row-block + + for n in range(0, N, BLOCK_N): + current_n = n + offset_n + # vector load + current_n_c = tl.max_contiguous(current_n, BLOCK_N) + b_ptr_mask = (current_n[:, None] < N) & (offset_k[None, :] < K) + + tiled_b = tl.load( + b_ptr + + current_n_c[:, None] * lora_k_stride + + offset_k[None, :] * lora_n_stride, + 
mask=b_ptr_mask, + other=0.0, + ) # [BLOCK_N,BLOCK_K] + + accumulator = tl.sum(tiled_a * tiled_b, 1) - offset_cn = tl.arange(0, BLOCK_N) + pid_n * BLOCK_N - c_ptr = out_ptr + cur_batch * cm_stride + offset_cn[None, :] * cn_stride - c_mask = offset_cn[None, :] < N - if ADD_INPUTS: - tiled_out = tl.load(c_ptr, mask=c_mask) - accumulator += tiled_out - tl.store(c_ptr, accumulator, mask=c_mask) + c_ptr = out_ptr + cur_batch * cm_stride + current_n * cn_stride + c_mask = current_n < N + if ADD_INPUTS: + tiled_out = tl.load(c_ptr, mask=c_mask) + accumulator += tiled_out + tl.store(c_ptr, accumulator, mask=c_mask) @torch.inference_mode() @@ -119,9 +118,8 @@ def bgmv_expand( # TODO tuning this config N, K = lora_b_weights.shape[-2:] # K= rank,N=hidden_size - BLOCK_N = 32 - BLOCK_K = 16 - EVEN_K = K % BLOCK_K == 0 + BLOCK_N = 512 + BLOCK_K = triton.next_power_of_2(K) ADD_INPUTS = add_inputs CAST_TYPE = False if inputs.dtype == torch.float32 and lora_b_weights.dtype in [ @@ -130,9 +128,9 @@ def bgmv_expand( ]: CAST_TYPE = True grid = [ - triton.cdiv(N, BLOCK_N), batchs, ] + config = {"num_stages": 4, "num_warps": 8} _bgmv_expand_kernel[grid]( inputs, lora_b_weights, @@ -149,8 +147,8 @@ def bgmv_expand( output_tensor.stride(1), BLOCK_N, BLOCK_K, - EVEN_K, ADD_INPUTS, CAST_TYPE, + **config, ) return diff --git a/vllm/lora/ops/bgmv_expand_slice.py b/vllm/lora/ops/bgmv_expand_slice.py index 0404f2383d10..a197f5eddb8b 100644 --- a/vllm/lora/ops/bgmv_expand_slice.py +++ b/vllm/lora/ops/bgmv_expand_slice.py @@ -9,17 +9,18 @@ import triton.language as tl import torch + @triton.jit def _bgmv_expand_slice_kernel( - input_ptr, - lora_ptr, - out_ptr, + input_ptr, + lora_ptr, + out_ptr, N, K, - lora_indices, - xm_stride, - xk_stride, - l0_stride, + lora_indices, + xm_stride, + xk_stride, + l0_stride, lora_k_stride, lora_n_stride, cm_stride, @@ -27,49 +28,56 @@ def _bgmv_expand_slice_kernel( slice_offset, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr, - EVEN_K: tl.constexpr, ADD_INPUTS: tl.constexpr, CAST_TYPE: tl.constexpr, ): - pid_n = tl.program_id(axis=0) - pid_sk = tl.program_id(axis=1) - cur_batch = tl.program_id(axis=2) + """ + C=A@B, and B is col-major matrix + """ + cur_batch = tl.program_id(axis=0) lora_index = tl.load(lora_indices + cur_batch) if lora_index == -1: return - offset_n = tl.arange(0, BLOCK_N) + pid_n * BLOCK_N offset_k = tl.arange(0, BLOCK_K) - offset_k = pid_sk * BLOCK_K + tl.arange(0, BLOCK_K) - rbn = tl.max_contiguous(tl.multiple_of(offset_n % N, BLOCK_N), BLOCK_N) - a_ptr = input_ptr + cur_batch * xm_stride + offset_k[None, :] * xk_stride - b_ptr = ( - lora_ptr - + l0_stride * lora_index - + rbn[None, :] * lora_k_stride - + offset_k[:, None] * lora_n_stride - ) - accumulator = tl.zeros((1, BLOCK_N), dtype=lora_ptr.dtype.element_ty) - for k in range(0, tl.cdiv(K, BLOCK_K)): - if EVEN_K: - tiled_a = tl.load(a_ptr) - tiled_b = tl.load(b_ptr) - else: - k_remaining = K - k * BLOCK_K - tiled_a = tl.load(a_ptr, mask=offset_k[None, :] < k_remaining, other=0.0) - tiled_b = tl.load(b_ptr, mask=offset_k[:, None] < k_remaining, other=0.0) - if CAST_TYPE: - tiled_a = tiled_a.to(lora_ptr.dtype.element_ty) - accumulator += tl.sum(tiled_a[None, :] * tiled_b, 1) - a_ptr += BLOCK_K * xk_stride - b_ptr += BLOCK_K * lora_n_stride + offset_n = tl.arange(0, BLOCK_N) + # tl.max_contiguous(offset_k, BLOCK_K) + tiled_a = tl.load( + input_ptr + cur_batch * xm_stride + offset_k * xk_stride, + mask=offset_k < K, + other=0, + ) # [BLOCK_K] + b_ptr = lora_ptr + l0_stride * lora_index + if CAST_TYPE: + tiled_a = 
tiled_a.to(lora_ptr.dtype.element_ty) + # sliding to next row-block + + for n in range(0, N, BLOCK_N): + current_n = n + offset_n + # vector load + current_n_c = tl.max_contiguous(current_n, BLOCK_N) + b_ptr_mask = (current_n[:, None] < N) & (offset_k[None, :] < K) + + tiled_b = tl.load( + b_ptr + + current_n_c[:, None] * lora_k_stride + + offset_k[None, :] * lora_n_stride, + mask=b_ptr_mask, + other=0.0, + ) # [BLOCK_N,BLOCK_K] - offset_cn = tl.arange(0, BLOCK_N) + pid_n * BLOCK_N+slice_offset - c_ptr = out_ptr + cur_batch * cm_stride + offset_cn[None, :] * cn_stride - c_mask = offset_cn[None, :] < (slice_offset+N) - if ADD_INPUTS: - tiled_out = tl.load(c_ptr, mask=c_mask) - accumulator += tiled_out - tl.store(c_ptr, accumulator, mask=c_mask) + accumulator = tl.sum(tiled_a * tiled_b, 1) + + c_ptr = ( + out_ptr + + cur_batch * cm_stride + + slice_offset # slice size + + current_n * cn_stride + ) + c_mask = current_n < N + if ADD_INPUTS: + tiled_out = tl.load(c_ptr, mask=c_mask) + accumulator += tiled_out + tl.store(c_ptr, accumulator, mask=c_mask) @torch.inference_mode() @@ -126,22 +134,18 @@ def bgmv_expand_slice( assert lora_b_weights.is_contiguous() - # TODO tuning this config N, K = lora_b_weights.shape[-2:] # K= rank,N=hidden_size - - BLOCK_M = 32 - BLOCK_N = 32 - BLOCK_K = 16 - EVEN_K = K % BLOCK_K == 0 + # TODO tuning this config + BLOCK_N = 512 + BLOCK_K = triton.next_power_of_2(K) ADD_INPUTS = add_inputs CAST_TYPE = False if inputs.dtype == torch.float32 and lora_b_weights.dtype in [ - torch.float16, - torch.bfloat16, + torch.float16, + torch.bfloat16, ]: CAST_TYPE = True grid = [ - triton.cdiv(max_seq_length, BLOCK_M) * triton.cdiv(N, BLOCK_N), batchs, ] _bgmv_expand_slice_kernel[grid]( @@ -159,10 +163,8 @@ def bgmv_expand_slice( output_tensor.stride(0), output_tensor.stride(1), slice_offset, - BLOCK_M, BLOCK_N, BLOCK_K, - EVEN_K, ADD_INPUTS, CAST_TYPE, ) diff --git a/vllm/lora/ops/bgmv_shrink.py b/vllm/lora/ops/bgmv_shrink.py index eeeff502eb5b..ac61c9d50bda 100644 --- a/vllm/lora/ops/bgmv_shrink.py +++ b/vllm/lora/ops/bgmv_shrink.py @@ -9,6 +9,7 @@ import triton.language as tl import torch + @triton.jit def _bgmv_shrink_kernel( input_ptr, @@ -27,49 +28,44 @@ def _bgmv_shrink_kernel( cn_stride, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr, - EVEN_K: tl.constexpr, - SPLIT_K: tl.constexpr, ): - pid_n = tl.program_id(axis=0) - pid_sk = tl.program_id(axis=1) - cur_batch = tl.program_id(axis=2) + cur_batch = tl.program_id(axis=0) lora_index = tl.load(lora_indices + cur_batch) if lora_index == -1: return - offset_n = tl.arange(0, BLOCK_N) + pid_n * BLOCK_N - offset_k = pid_sk * BLOCK_K + tl.arange(0, BLOCK_K) - rbn = tl.max_contiguous(tl.multiple_of(offset_n % N, BLOCK_N), BLOCK_N) - a_ptr = input_ptr + cur_batch * xm_stride + offset_k[:,None] * xk_stride - b_ptr = ( - lora_ptr - + l0_stride * lora_index - + rbn[None, :] * lora_k_stride - + offset_k[:, None] * lora_n_stride - ) - accumulator = tl.zeros((1,BLOCK_N), dtype=tl.float32) - for k in range(0, tl.cdiv(K, BLOCK_K * SPLIT_K)): - if EVEN_K: - tiled_a = tl.load(a_ptr) - tiled_b = tl.load(b_ptr) - else: - k_remaining = K - k * (BLOCK_K * SPLIT_K) - tiled_a = tl.load( - a_ptr, mask=offset_k[None, :] < k_remaining, other=0.0 - ) - tiled_b = tl.load( - b_ptr, mask=offset_k[:, None] < k_remaining, other=0.0 - ) - accumulator += tl.sum(tiled_a[None,:] * tiled_b, 1) - a_ptr += BLOCK_K * SPLIT_K * xk_stride - b_ptr += BLOCK_K * SPLIT_K * lora_n_stride + + offset_n = tl.arange(0, BLOCK_N) + offset_k = tl.arange(0, BLOCK_K) + a_ptr = 
input_ptr + cur_batch * xm_stride + b_ptr = lora_ptr + l0_stride * lora_index + rank_mask = offset_n[:, None] < N + accumulator = tl.zeros((BLOCK_N,), dtype=tl.float32) + for k in range(0, K, BLOCK_K): + current_k = k + offset_k + # vector load + current_k_c = tl.max_contiguous(current_k, BLOCK_K) + tiled_a = tl.load( + a_ptr + current_k_c * xk_stride, + mask=current_k < K, + other=0.0, + ) # [BLOCK_K] + b_ptr_mask = (rank_mask < N) & (current_k[None, :] < K) + + tiled_b = tl.load( + b_ptr + + offset_n[:, None] * lora_k_stride + + current_k[None, :] * lora_n_stride, + mask=b_ptr_mask, + other=0.0, + ) # [BLOCK_N,BLOCK_K] + + accumulator += tl.sum(tiled_a * tiled_b, 1) accumulator *= scaling - offset_cn = tl.arange(0, BLOCK_N) + pid_n * BLOCK_N - c_ptr = out_ptr + cur_batch * cm_stride + offset_cn[None, :] * cn_stride - c_mask = offset_cn[None, :] < N - if SPLIT_K: - tl.store(c_ptr, accumulator, mask=c_mask) - else: - tl.atomic_add(c_ptr, accumulator, mask=c_mask) + offset_cn = tl.arange(0, BLOCK_N) + c_ptr = out_ptr + cur_batch * cm_stride + offset_cn * cn_stride + c_mask = offset_cn < N + + tl.store(c_ptr, accumulator, mask=c_mask) @torch.inference_mode() @@ -107,15 +103,12 @@ def bgmv_shrink( assert output_tensor.is_contiguous() # TODO tuning this config N, K = lora_a_weights.shape[-2:] # K=hidden_size,N=rank - BLOCK_N = 16 - BLOCK_K = 32 - SPLIT_K = 1 - EVEN_K = K % (BLOCK_K * SPLIT_K) == 0 + BLOCK_K = 512 + BLOCK_N = triton.next_power_of_2(output_tensor.size(1)) grid = [ - triton.cdiv(N, BLOCK_N), - SPLIT_K, batchs, ] + config = {"num_stages": 4, "num_warps": 8} _bgmv_shrink_kernel[grid]( inputs, lora_a_weights, @@ -133,7 +126,6 @@ def bgmv_shrink( output_tensor.stride(1), BLOCK_N, BLOCK_K, - EVEN_K, - SPLIT_K, + **config, ) return diff --git a/vllm/lora/ops/sgmv_shrink.py b/vllm/lora/ops/sgmv_shrink.py index 6a94aedde9d5..65bf1a6a5d47 100644 --- a/vllm/lora/ops/sgmv_shrink.py +++ b/vllm/lora/ops/sgmv_shrink.py @@ -120,6 +120,7 @@ def sgmv_shrink( batchs: int, max_seq_length: int, scaling: float, + config: dict, ): """ @@ -156,16 +157,26 @@ def sgmv_shrink( assert output_tensor.is_contiguous() # TODO tuning this config N, K = lora_a_weights.shape[-2:] # K=hidden_size,N=rank - BLOCK_M = 32 - BLOCK_N = 16 - BLOCK_K = 32 - SPLIT_K = 16 - EVEN_K = False - grid = [ - triton.cdiv(max_seq_length, BLOCK_M) * triton.cdiv(N, BLOCK_N), - SPLIT_K, - batchs, - ] + # BLOCK_M = config.get("BLOCK_M", 32) + # BLOCK_N = config.get("BLOCK_N", 32) + # BLOCK_K = config.get("BLOCK_K", 32) + # SPLIT_K = config.get("SPLIT_K", 16) + # num_warps = config.get("num_warps", 4) + # num_stages = config.get("num_stages", 3) + # BLOCK_M = 32 + # BLOCK_N = 16 + # BLOCK_K = 32 + # SPLIT_K = 16 + EVEN_K = K % config.get("BLOCK_K", 32) == 0 + # grid = [ + # triton.cdiv(max_seq_length, BLOCK_M) * triton.cdiv(N, BLOCK_N), + # SPLIT_K, + # batchs, + # ] + + grid = lambda META: (triton.cdiv(max_seq_length, META[ + 'BLOCK_M']) * triton.cdiv(N, META['BLOCK_N']),META[ + 'SPLIT_K'],batchs) _sgmv_shrink_kernel[grid]( inputs, lora_a_weights, @@ -175,7 +186,7 @@ def sgmv_shrink( b_seq_start_loc, seq_len_tensor, lora_indices_tensor, - scaling, + scaling, inputs.stride(0), inputs.stride(1), lora_a_weights.stride(0), @@ -183,10 +194,12 @@ def sgmv_shrink( lora_a_weights.stride(2), output_tensor.stride(0), output_tensor.stride(1), - BLOCK_M, - BLOCK_N, - BLOCK_K, - EVEN_K, - SPLIT_K, + EVEN_K=EVEN_K, + **config + # BLOCK_M, + # BLOCK_N, + # BLOCK_K, + # EVEN_K, + # SPLIT_K, ) return From e2f56d5774e23227ab2f8578190665a62c23cfe1 Mon Sep 
17 00:00:00 2001 From: jeejeeli Date: Fri, 31 May 2024 22:44:46 +0800 Subject: [PATCH 10/71] resolve conflict --- tests/lora/test_triton_sgmv.py | 396 --------------------------------- vllm/worker/model_runner.py | 2 +- 2 files changed, 1 insertion(+), 397 deletions(-) delete mode 100644 tests/lora/test_triton_sgmv.py diff --git a/tests/lora/test_triton_sgmv.py b/tests/lora/test_triton_sgmv.py deleted file mode 100644 index db3739f35d24..000000000000 --- a/tests/lora/test_triton_sgmv.py +++ /dev/null @@ -1,396 +0,0 @@ -import random - -import pytest -import torch - -import vllm.lora.punica as punica -from vllm.lora.ops.sgmv_expand import sgmv_expand -from vllm.lora.ops.sgmv_shrink import sgmv_shrink -from vllm.lora.ops.sgmv_expand_slice import sgmv_expand_slice - -# The current punica kernel supports dimension and adds a dimension of 3424. -HIDDEN_SIZES = [ - 128, - 256, - 512, - 1024, - 1152, - 1280, - 1536, - 2048, - 2304, - 2560, - 2752, - 3072, - 3424, - 3456, - 3584, - 4096, - 4608, - 5120, - 5504, - 5632, - 6144, - 6848, - 6912, - 7168, - 8192, - 9216, - 10240, - 11008, - 13824, - 14336, - 15360, - 22016, - 24576, - 27392, - 27648, - 32000, - 32256, - 32512, - 32768, - 33024, - 36864, - 43264, - 49152, - 64000, - 64256, - 102400, - 102656, - 128000, - 128256, -] -BATCHS = [i for i in range(0, 64, 8)] -NUM_LORA = [1, 4, 8, 16, 32, 64, 128, 256] -DTYPES = [torch.half, torch.bfloat16, torch.float32] -MAX_RANKS = [1, 4, 8, 16, 32, 64, 128] -SCALES = [0.5] -OP_TYPES = ["shrink", "expand"] -SEED = [0] -CUDA_DEVICES = [f"cuda:{0}"] - - -def assert_close(a, b): - rtol, atol = { - torch.float16: (1e-2, 1e-2), - torch.bfloat16: (12e-2, 1e-2), - torch.float32: (1e-2, 1e-2), - }[a.dtype] - torch.testing.assert_close(a, b, rtol=rtol, atol=atol) - - -@torch.inference_mode() -def _punica_bgmv(out_tensor, inputs, lora_weights, indices, scaling): - layer_idx = 0 - punica.bgmv(out_tensor, inputs, lora_weights, indices, layer_idx, scaling) - return - - -def _torch_groupgemm( - out_tensor, - inputs, - lora_weights, - lora_indices_tensor, - seq_len_tensor, - batchs, - scaling, - op_type, -) -> torch.Tensor: - out_list = [] - current_offset = 0 - for lora_index, b_length in zip(range(batchs), seq_len_tensor): - input_weight = inputs[current_offset:b_length + current_offset, :] - current_offset += b_length - lora_weight = lora_weights[lora_indices_tensor[lora_index]] - result = torch.nn.functional.linear(input_weight, lora_weight) - result *= scaling - out_list.append(result) - cat_result = torch.cat(out_list, dim=0) - if op_type == "expand": - out_tensor += cat_result - else: - out_tensor.copy_(cat_result) - return - - -def _generate_data(batchs, hidden_size, lora_nums, max_rank, max_length, dtype, - op_type, device): - seq_len_tensor = torch.randint(1, max_length, (batchs, )).to(device) - b_seq_start_loc = torch.cumsum( - torch.tensor([0] + seq_len_tensor[:-1].tolist(), dtype=torch.long), - dim=0, - ).to(device) - total_tokens = seq_len_tensor.sum() - if op_type == "shrink": - inputs_tensor = torch.rand((total_tokens, hidden_size), - dtype=dtype).to(device) - lora_weights = torch.rand( - (lora_nums, max_rank, hidden_size), # col-major - dtype=dtype, - ).to(device) - # shrink op need atomic_add, so output is initinized by 0 - ref_out_tensor = torch.zeros((total_tokens, max_rank), - dtype=dtype, - device=inputs_tensor.device) - # NOTE shrink kernel using torch.float32 as output type - our_out_tensor = torch.zeros( - (total_tokens, max_rank), - dtype=torch.float32, - device=inputs_tensor.device, - ) - else: 
- - inputs_tensor = torch.rand( - (total_tokens, max_rank), - dtype=dtype, - ).to(device) - lora_weights = torch.rand( - (lora_nums, hidden_size, max_rank), # col-major - dtype=dtype, - ).to(device) - # expand op needs to complete y+=a@lora_b, so output is - # initinized randomly - ref_out_tensor = torch.rand( - (total_tokens, hidden_size), - dtype=dtype, - device=inputs_tensor.device, - ) - # Ensure the same input. - our_out_tensor = ref_out_tensor.clone() - - lora_indices_tensor = torch.randint(0, - lora_nums - 1 if lora_nums > 1 else 1, - (batchs, )).to(device) - indices = torch.zeros((total_tokens), dtype=torch.long).to(device) - current_offset = 0 - for b_id in range(batchs): - lora_index = lora_indices_tensor[b_id] - indices[current_offset:current_offset + - seq_len_tensor[b_id]] = lora_index.item() - current_offset += seq_len_tensor[b_id].item() - return ( - inputs_tensor, - lora_weights, - our_out_tensor, - ref_out_tensor, - b_seq_start_loc, - lora_indices_tensor, - seq_len_tensor, - indices, - ) - - -@pytest.mark.parametrize("batchs", BATCHS) -@pytest.mark.parametrize("num_loras", NUM_LORA) -@pytest.mark.parametrize("rank", MAX_RANKS) -@pytest.mark.parametrize("scaling", SCALES) -@pytest.mark.parametrize("dtype", DTYPES) -@pytest.mark.parametrize("op_type", OP_TYPES) -@pytest.mark.parametrize("seed", SEED) -@pytest.mark.parametrize("device", CUDA_DEVICES) -def test_sgmv_torch( - batchs: int, - num_loras: int, - rank: int, - scaling: float, - dtype: torch.dtype, - op_type: str, - seed: int, - device: str, -): - torch.manual_seed(seed) - if batchs == 0: - batchs += 1 - hidden_size_index = random.randint(0, len(HIDDEN_SIZES) - 1) - hidden_size = HIDDEN_SIZES[hidden_size_index] - if hidden_size > 100000: - hidden_size = hidden_size // 4 # avoid OOM - ( - inputs_tensor, - lora_weights, - our_out_tensor, - ref_out_tensor, - b_seq_start_loc, - lora_indices_tensor, - seq_len_tensor, - indices, - ) = _generate_data( - batchs, hidden_size, num_loras, rank, 1024, dtype, op_type, - device) # The sequence length is restricted to the range [1, 1024]. - max_seq_length = seq_len_tensor.max() - if isinstance(max_seq_length, tuple): - max_seq_length = max_seq_length[0].item() - else: - max_seq_length = max_seq_length.item() - if op_type == "shrink": - sgmv_shrink( - inputs_tensor, - lora_weights, - our_out_tensor, - b_seq_start_loc, - seq_len_tensor, - lora_indices_tensor, - batchs, - max_seq_length, - scaling, - ) - else: - sgmv_expand(inputs_tensor, - lora_weights, - our_out_tensor, - b_seq_start_loc, - seq_len_tensor, - lora_indices_tensor, - batchs, - max_seq_length, - add_inputs=True) - _torch_groupgemm(ref_out_tensor, inputs_tensor, lora_weights, - lora_indices_tensor, seq_len_tensor, batchs, - scaling if op_type == "shrink" else 1.0, op_type) - if op_type == "shrink": - ref_out_tensor = ref_out_tensor.to(torch.float32) - assert_close(our_out_tensor, ref_out_tensor) - - -@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES) -@pytest.mark.parametrize("scaling", SCALES) -@pytest.mark.parametrize("dtype", DTYPES) -@pytest.mark.parametrize("op_type", OP_TYPES) -@pytest.mark.parametrize("seed", SEED) -@pytest.mark.parametrize("device", CUDA_DEVICES) -def test_sgmv_punica_bgmv( - hidden_size, - scaling: float, - dtype: torch.dtype, - op_type: str, - seed: int, - device: str, -): - # avoid `No suitable kernel. 
h_in=xx h_out=xxxx ` error - if dtype == torch.float32 or hidden_size == 3424: - return - torch.manual_seed(seed) - batchs = 4 # Arbitrary values for testing - rank = 16 - seq_len = 333 # Arbitrary values for testing - num_loras = 8 # Arbitrary values for testing - ( - inputs_tensor, - lora_weights, - our_out_tensor, - ref_out_tensor, - b_seq_start_loc, - lora_indices_tensor, - seq_len_tensor, - indices, - ) = _generate_data(batchs, hidden_size, num_loras, rank, seq_len, dtype, - op_type, device) - - max_seq_length = seq_len_tensor.max() - if isinstance(max_seq_length, tuple): - max_seq_length = max_seq_length[0].item() - else: - max_seq_length = max_seq_length.item() - if op_type == "shrink": - sgmv_shrink( - inputs_tensor, - lora_weights, - our_out_tensor, - b_seq_start_loc, - seq_len_tensor, - lora_indices_tensor, - batchs, - max_seq_length, - scaling, - ) - else: - sgmv_expand( - inputs_tensor, - lora_weights, - our_out_tensor, - b_seq_start_loc, - seq_len_tensor, - lora_indices_tensor, - batchs, - max_seq_length, - add_inputs=True, - ) - lora_weights_4d = lora_weights.unsqueeze(dim=1) - _punica_bgmv( - ref_out_tensor, - inputs_tensor, - lora_weights_4d, - indices, - scaling if op_type == "shrink" else 1.0, - ) - if op_type == "shrink": - ref_out_tensor = ref_out_tensor.to(torch.float32) - assert_close(our_out_tensor, ref_out_tensor) - - -@pytest.mark.skip("TODO") -@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES) -@pytest.mark.parametrize("scaling", SCALES) -@pytest.mark.parametrize("dtype", DTYPES) -@pytest.mark.parametrize("seed", SEED) -@pytest.mark.parametrize("device", CUDA_DEVICES) -def test_sgmv_expand_nslice( - hidden_size, - scaling: float, - dtype: torch.dtype, - op_type: str, - seed: int, - device: str, -): - # avoid `No suitable kernel. 
h_in=xx h_out=xxxx ` error - if dtype == torch.float32 or hidden_size == 3424: - return - torch.manual_seed(seed) - batchs = 4 # Arbitrary values for testing - rank = 16 - seq_len = 333 # Arbitrary values for testing - num_loras = 8 # Arbitrary values for testing - ( - inputs_tensor, - lora_weights, - our_out_tensor, - ref_out_tensor, - b_seq_start_loc, - lora_indices_tensor, - seq_len_tensor, - indices, - ) = _generate_data(batchs, hidden_size, num_loras, rank, seq_len, dtype, - op_type, device) - - max_seq_length = seq_len_tensor.max() - if isinstance(max_seq_length, tuple): - max_seq_length = max_seq_length[0].item() - else: - max_seq_length = max_seq_length.item() - - sgmv_expand_slice( - inputs_tensor, - lora_weights, - our_out_tensor, - b_seq_start_loc, - seq_len_tensor, - lora_indices_tensor, - batchs, - max_seq_length, - 1024, - add_inputs=True, - ) - lora_weights_4d = lora_weights.unsqueeze(dim=1) - _punica_bgmv( - ref_out_tensor, - inputs_tensor, - lora_weights_4d, - indices, - scaling if op_type == "shrink" else 1.0, - ) - - assert_close(our_out_tensor, ref_out_tensor) - - -if __name__ == "__main__": - pytest.main(["test_triton_sgmv.py::test_sgmv_expand_nslice"]) diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index aaa8a66c40ab..a3e52a749fb6 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -386,7 +386,7 @@ def _prepare_model_input( if lora_id > 0: lora_requests.add(seq_group_metadata.lora_request) - lora_index_mapping += [lora_id] * (seq_len - context_len) + lora_index_mapping += [lora_id] * query_len batch_lora_index_mapping += [lora_id if lora_id > 0 else -1] lora_prompt_mapping.extend( [lora_id] * From e0cb42b726c5dc06f4222f676ca301100e15ffbf Mon Sep 17 00:00:00 2001 From: jeejeeli Date: Tue, 4 Jun 2024 15:18:25 +0800 Subject: [PATCH 11/71] optimize bgmv_shrink --- vllm/lora/ops/bgmv_shrink.py | 19 ++++++++++++------ vllm/lora/ops/sgmv_shrink.py | 39 +++++++++++++++++------------------- 2 files changed, 31 insertions(+), 27 deletions(-) diff --git a/vllm/lora/ops/bgmv_shrink.py b/vllm/lora/ops/bgmv_shrink.py index ac61c9d50bda..ed208796633a 100644 --- a/vllm/lora/ops/bgmv_shrink.py +++ b/vllm/lora/ops/bgmv_shrink.py @@ -28,24 +28,26 @@ def _bgmv_shrink_kernel( cn_stride, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr, + SPLIT_K: tl.constexpr, ): - cur_batch = tl.program_id(axis=0) + pid_sk = tl.program_id(axis=0) + cur_batch = tl.program_id(axis=1) lora_index = tl.load(lora_indices + cur_batch) if lora_index == -1: return offset_n = tl.arange(0, BLOCK_N) - offset_k = tl.arange(0, BLOCK_K) + offset_k = tl.arange(0, BLOCK_K) + pid_sk * BLOCK_K a_ptr = input_ptr + cur_batch * xm_stride b_ptr = lora_ptr + l0_stride * lora_index rank_mask = offset_n[:, None] < N accumulator = tl.zeros((BLOCK_N,), dtype=tl.float32) - for k in range(0, K, BLOCK_K): + for k in range(0, K, BLOCK_K * SPLIT_K): current_k = k + offset_k # vector load current_k_c = tl.max_contiguous(current_k, BLOCK_K) tiled_a = tl.load( - a_ptr + current_k_c * xk_stride, + a_ptr + current_k_c, mask=current_k < K, other=0.0, ) # [BLOCK_K] @@ -64,8 +66,10 @@ def _bgmv_shrink_kernel( offset_cn = tl.arange(0, BLOCK_N) c_ptr = out_ptr + cur_batch * cm_stride + offset_cn * cn_stride c_mask = offset_cn < N - - tl.store(c_ptr, accumulator, mask=c_mask) + if SPLIT_K == 1: + tl.store(c_ptr, accumulator, mask=c_mask) + else: + tl.atomic_add(c_ptr, accumulator, mask=c_mask) @torch.inference_mode() @@ -105,7 +109,9 @@ def bgmv_shrink( N, K = lora_a_weights.shape[-2:] # 
K=hidden_size,N=rank BLOCK_K = 512 BLOCK_N = triton.next_power_of_2(output_tensor.size(1)) + SPLIT_K = 16 grid = [ + SPLIT_K, batchs, ] config = {"num_stages": 4, "num_warps": 8} @@ -126,6 +132,7 @@ def bgmv_shrink( output_tensor.stride(1), BLOCK_N, BLOCK_K, + SPLIT_K, **config, ) return diff --git a/vllm/lora/ops/sgmv_shrink.py b/vllm/lora/ops/sgmv_shrink.py index 65bf1a6a5d47..d27bcd15880b 100644 --- a/vllm/lora/ops/sgmv_shrink.py +++ b/vllm/lora/ops/sgmv_shrink.py @@ -120,7 +120,6 @@ def sgmv_shrink( batchs: int, max_seq_length: int, scaling: float, - config: dict, ): """ @@ -163,20 +162,20 @@ def sgmv_shrink( # SPLIT_K = config.get("SPLIT_K", 16) # num_warps = config.get("num_warps", 4) # num_stages = config.get("num_stages", 3) - # BLOCK_M = 32 - # BLOCK_N = 16 - # BLOCK_K = 32 - # SPLIT_K = 16 - EVEN_K = K % config.get("BLOCK_K", 32) == 0 - # grid = [ - # triton.cdiv(max_seq_length, BLOCK_M) * triton.cdiv(N, BLOCK_N), - # SPLIT_K, - # batchs, - # ] + BLOCK_M = 32 + BLOCK_N = 16 + BLOCK_K = 32 + SPLIT_K = 8 + EVEN_K = K % BLOCK_K == 0 + grid = [ + triton.cdiv(max_seq_length, BLOCK_M) * triton.cdiv(N, BLOCK_N), + SPLIT_K, + batchs, + ] - grid = lambda META: (triton.cdiv(max_seq_length, META[ - 'BLOCK_M']) * triton.cdiv(N, META['BLOCK_N']),META[ - 'SPLIT_K'],batchs) + # grid = lambda META: (triton.cdiv(max_seq_length, META[ + # 'BLOCK_M']) * triton.cdiv(N, META['BLOCK_N']),META[ + # 'SPLIT_K'],batchs) _sgmv_shrink_kernel[grid]( inputs, lora_a_weights, @@ -194,12 +193,10 @@ def sgmv_shrink( lora_a_weights.stride(2), output_tensor.stride(0), output_tensor.stride(1), - EVEN_K=EVEN_K, - **config - # BLOCK_M, - # BLOCK_N, - # BLOCK_K, - # EVEN_K, - # SPLIT_K, + BLOCK_M, + BLOCK_N, + BLOCK_K, + EVEN_K, + SPLIT_K, ) return From 64416e071a5e12b5aa7170b28d277324f333517f Mon Sep 17 00:00:00 2001 From: jeejeeli Date: Tue, 4 Jun 2024 18:44:45 +0800 Subject: [PATCH 12/71] optimize bgmv_expand --- vllm/lora/ops/bgmv_expand.py | 36 +++++++++++++++++++++++++----------- 1 file changed, 25 insertions(+), 11 deletions(-) diff --git a/vllm/lora/ops/bgmv_expand.py b/vllm/lora/ops/bgmv_expand.py index 7762276b65ce..aa572151fb41 100644 --- a/vllm/lora/ops/bgmv_expand.py +++ b/vllm/lora/ops/bgmv_expand.py @@ -27,34 +27,42 @@ def _bgmv_expand_kernel( cn_stride, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr, + SPLIT_N: tl.constexpr, ADD_INPUTS: tl.constexpr, CAST_TYPE: tl.constexpr, ): """ C=A@B, and B is col-major matrix """ - cur_batch = tl.program_id(axis=0) + pid_sn = tl.program_id(axis=0) + cur_batch = tl.program_id(axis=1) lora_index = tl.load(lora_indices + cur_batch) if lora_index == -1: return offset_k = tl.arange(0, BLOCK_K) offset_n = tl.arange(0, BLOCK_N) - # tl.max_contiguous(offset_k, BLOCK_K) tiled_a = tl.load( input_ptr + cur_batch * xm_stride + offset_k * xk_stride, mask=offset_k < K, other=0, ) # [BLOCK_K] - b_ptr = lora_ptr + l0_stride * lora_index + + split_n_length = tl.cdiv(N, SPLIT_N) if CAST_TYPE: tiled_a = tiled_a.to(lora_ptr.dtype.element_ty) # sliding to next row-block - - for n in range(0, N, BLOCK_N): + b_ptr = ( + lora_ptr + + l0_stride * lora_index + + pid_sn * split_n_length * lora_k_stride + ) + for n in range(0, split_n_length, BLOCK_N): current_n = n + offset_n # vector load current_n_c = tl.max_contiguous(current_n, BLOCK_N) - b_ptr_mask = (current_n[:, None] < N) & (offset_k[None, :] < K) + b_ptr_mask = (current_n[:, None] < split_n_length) & ( + offset_k[None, :] < K + ) tiled_b = tl.load( b_ptr @@ -66,8 +74,13 @@ def _bgmv_expand_kernel( accumulator = tl.sum(tiled_a * 
tiled_b, 1) - c_ptr = out_ptr + cur_batch * cm_stride + current_n * cn_stride - c_mask = current_n < N + c_ptr = ( + out_ptr + + cur_batch * cm_stride + + pid_sn * split_n_length + + current_n * cn_stride + ) + c_mask = current_n < split_n_length if ADD_INPUTS: tiled_out = tl.load(c_ptr, mask=c_mask) accumulator += tiled_out @@ -118,8 +131,9 @@ def bgmv_expand( # TODO tuning this config N, K = lora_b_weights.shape[-2:] # K= rank,N=hidden_size - BLOCK_N = 512 + BLOCK_N = 512 BLOCK_K = triton.next_power_of_2(K) + SPLIT_N = 8 ADD_INPUTS = add_inputs CAST_TYPE = False if inputs.dtype == torch.float32 and lora_b_weights.dtype in [ @@ -128,9 +142,9 @@ def bgmv_expand( ]: CAST_TYPE = True grid = [ + SPLIT_N, batchs, ] - config = {"num_stages": 4, "num_warps": 8} _bgmv_expand_kernel[grid]( inputs, lora_b_weights, @@ -147,8 +161,8 @@ def bgmv_expand( output_tensor.stride(1), BLOCK_N, BLOCK_K, + SPLIT_N, ADD_INPUTS, CAST_TYPE, - **config, ) return From 891df631f3d4d18243c117c7e63183b12f2825fe Mon Sep 17 00:00:00 2001 From: jeejeeli Date: Wed, 5 Jun 2024 00:35:01 +0800 Subject: [PATCH 13/71] add bgmv --- tests/lora/test_triton_punica.py | 56 +++++++--------- vllm/lora/layers.py | 15 +++-- vllm/lora/models.py | 22 +++--- vllm/lora/ops/bgmv_expand.py | 29 +++----- vllm/lora/ops/bgmv_expand_slice.py | 56 +++++++--------- vllm/lora/ops/bgmv_shrink.py | 7 +- vllm/lora/ops/sgmv_expand_slice.py | 2 +- vllm/lora/ops/sgmv_shrink.py | 42 +++++------- vllm/lora/punica.py | 103 ++++++++++++++++++++++++++++- 9 files changed, 200 insertions(+), 132 deletions(-) diff --git a/tests/lora/test_triton_punica.py b/tests/lora/test_triton_punica.py index 74bab70f1aad..6aea2573d962 100644 --- a/tests/lora/test_triton_punica.py +++ b/tests/lora/test_triton_punica.py @@ -62,7 +62,7 @@ 128000, 128256, ] -BATCHS = [i for i in range(0, 64, 8)] +BATCHS = [i for i in range(0, 128, 8)] NUM_LORA = [1, 4, 8, 16, 32, 64, 128, 256] DTYPES = [torch.half, torch.bfloat16, torch.float32] MAX_RANKS = [1, 4, 8, 16, 32, 64, 128] @@ -101,7 +101,7 @@ def _torch_groupgemm( out_list = [] current_offset = 0 for lora_index, b_length in zip(range(batchs), seq_len_tensor): - input_weight = inputs[current_offset : b_length + current_offset, :] + input_weight = inputs[current_offset:b_length + current_offset, :] current_offset += b_length lora_weight = lora_weights[lora_indices_tensor[lora_index]] result = torch.nn.functional.linear(input_weight, lora_weight) @@ -115,29 +115,27 @@ def _torch_groupgemm( return -def _generate_data( - batchs, hidden_size, lora_nums, max_rank, max_length, dtype, op_type, device -): +def _generate_data(batchs, hidden_size, lora_nums, max_rank, max_length, dtype, + op_type, device): if max_length == 1: max_length += 1 - seq_len_tensor = torch.randint(1, max_length, (batchs,)).to(device) + seq_len_tensor = torch.randint(1, max_length, (batchs, )).to(device) b_seq_start_loc = torch.cumsum( torch.tensor([0] + seq_len_tensor[:-1].tolist(), dtype=torch.long), dim=0, ).to(device) total_tokens = seq_len_tensor.sum() if op_type == "shrink": - inputs_tensor = torch.rand((total_tokens, hidden_size), dtype=dtype).to( - device - ) + inputs_tensor = torch.rand((total_tokens, hidden_size), + dtype=dtype).to(device) lora_weights = torch.rand( (lora_nums, max_rank, hidden_size), # col-major dtype=dtype, ).to(device) # shrink op need atomic_add, so output is initinized by 0 - ref_out_tensor = torch.zeros( - (total_tokens, max_rank), dtype=dtype, device=inputs_tensor.device - ) + ref_out_tensor = torch.zeros((total_tokens, max_rank), + 
dtype=dtype, + device=inputs_tensor.device) # NOTE shrink kernel using torch.float32 as output type our_out_tensor = torch.zeros( (total_tokens, max_rank), @@ -163,16 +161,15 @@ def _generate_data( # Ensure the same input. our_out_tensor = ref_out_tensor.clone() - lora_indices_tensor = torch.randint( - 0, lora_nums - 1 if lora_nums > 1 else 1, (batchs,) - ).to(device) + lora_indices_tensor = torch.randint(0, + lora_nums - 1 if lora_nums > 1 else 1, + (batchs, )).to(device) indices = torch.zeros((total_tokens), dtype=torch.long).to(device) current_offset = 0 for b_id in range(batchs): lora_index = lora_indices_tensor[b_id] - indices[ - current_offset : current_offset + seq_len_tensor[b_id] - ] = lora_index.item() + indices[current_offset:current_offset + + seq_len_tensor[b_id]] = lora_index.item() current_offset += seq_len_tensor[b_id].item() return ( inputs_tensor, @@ -186,7 +183,7 @@ def _generate_data( ) -@pytest.mark.skip("work in progress") +# @pytest.mark.skip("work in progress") @pytest.mark.parametrize("batchs", BATCHS) @pytest.mark.parametrize("num_loras", NUM_LORA) @pytest.mark.parametrize("rank", MAX_RANKS) @@ -222,8 +219,8 @@ def test_sgmv_torch( seq_len_tensor, indices, ) = _generate_data( - batchs, hidden_size, num_loras, rank, 1024, dtype, op_type, device - ) # The sequence length is restricted to the range [1, 1024]. + batchs, hidden_size, num_loras, rank, 1024, dtype, op_type, + device) # The sequence length is restricted to the range [1, 1024]. max_seq_length = seq_len_tensor.max() if isinstance(max_seq_length, tuple): max_seq_length = max_seq_length[0].item() @@ -268,7 +265,7 @@ def test_sgmv_torch( assert_close(our_out_tensor, ref_out_tensor) -@pytest.mark.skip("work in progress") +# @pytest.mark.skip("work in progress") @pytest.mark.parametrize("hidden_size", HIDDEN_SIZES) @pytest.mark.parametrize("scaling", SCALES) @pytest.mark.parametrize("dtype", DTYPES) @@ -300,9 +297,8 @@ def test_triton_sgmv_punica_bgmv( lora_indices_tensor, seq_len_tensor, indices, - ) = _generate_data( - batchs, hidden_size, num_loras, rank, seq_len, dtype, op_type, device - ) + ) = _generate_data(batchs, hidden_size, num_loras, rank, seq_len, dtype, + op_type, device) max_seq_length = seq_len_tensor.max() if isinstance(max_seq_length, tuple): @@ -380,9 +376,8 @@ def test_triton_bgmv_punica_bgmv( lora_indices_tensor, seq_len_tensor, indices, - ) = _generate_data( - batchs, hidden_size, num_loras, rank, seq_len, dtype, op_type, device - ) + ) = _generate_data(batchs, hidden_size, num_loras, rank, seq_len, dtype, + op_type, device) if op_type == "shrink": bgmv_shrink( @@ -446,9 +441,8 @@ def test_sgmv_expand_nslice( lora_indices_tensor, seq_len_tensor, indices, - ) = _generate_data( - batchs, hidden_size, num_loras, rank, seq_len, dtype, op_type, device - ) + ) = _generate_data(batchs, hidden_size, num_loras, rank, seq_len, dtype, + op_type, device) max_seq_length = seq_len_tensor.max() if isinstance(max_seq_length, tuple): diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py index a3a40ad0bd24..ba7f52ff2fb1 100644 --- a/vllm/lora/layers.py +++ b/vllm/lora/layers.py @@ -126,10 +126,11 @@ def _apply_lora_triton( batch_size = batch_mlen_stage_lst[0] max_length = batch_mlen_stage_lst[1] + is_prefilling = bool(batch_mlen_stage_lst[2]) add_lora_triton(output, x, lora_a_stacked, lora_b_stacked, b_seq_start_tensor, seq_length_tensor, lora_index_tensor, - batch_size, max_length, 0, 1.0) + batch_size, max_length, 0, 1.0, is_prefilling) return output.view_as(org_output) @@ -206,7 +207,7 @@ def 
_apply_lora_triton_nslice( batch_size = batch_mlen_stage_lst[0] max_length = batch_mlen_stage_lst[1] - + is_prefilling = bool(batch_mlen_stage_lst[2]) offset_left = 0 #TODO fuse these kernel for slice_idx in range(len(output_slices)): @@ -214,7 +215,7 @@ def _apply_lora_triton_nslice( lora_b_stacked[slice_idx], b_seq_start_tensor, seq_length_tensor, lora_index_tensor, batch_size, max_length, 0, 1.0, offset_left, - output_slices[slice_idx]) + output_slices[slice_idx], is_prefilling) offset_left += output_slices[slice_idx] return output.view_as(org_output) @@ -558,8 +559,8 @@ def apply(self, x: torch.Tensor, _apply_lora_triton(x, self.lora_a_stacked, self.lora_b_stacked, self.b_seq_start_tensor[:batch_size], self.seq_length_tensor[:batch_size], - self.indices[:batch_size], self.batch_mlen_stage_lst, - output) + self.indices[:batch_size], + self.batch_mlen_stage_lst, output) return output def forward(self, input_): @@ -1125,8 +1126,8 @@ def apply(self, x: torch.Tensor) -> torch.Tensor: _apply_lora_triton(x, self.lora_a_stacked, self.lora_b_stacked, self.b_seq_start_tensor[:batch_size], self.seq_length_tensor[:batch_size], - self.indices[:batch_size], self.batch_mlen_stage_lst, - output) + self.indices[:batch_size], + self.batch_mlen_stage_lst, output) return output # def apply(self, x: torch.Tensor) -> torch.Tensor: diff --git a/vllm/lora/models.py b/vllm/lora/models.py index 1cdc3a03b8bf..438eeff1ff0c 100644 --- a/vllm/lora/models.py +++ b/vllm/lora/models.py @@ -24,7 +24,7 @@ # _BATCH_SIZES_TO_CAPTURE.It needs to be updated if _BATCH_SIZES_TO_CAPTURE # is changed. -_MAX_BATCHS = 256+16 #max(_BATCH_SIZES_TO_CAPTURE)+16 +_MAX_BATCHS = 256 + 16 #max(_BATCH_SIZES_TO_CAPTURE)+16 logger = init_logger(__name__) @@ -448,10 +448,10 @@ def __init__( dtype=torch.long, device="cuda") - # element contains batch_size, max_length, 0 or 1. Use 1 for the - # prefilling stage and 0 for the decoding stage.The reason for - # distinguishing between the prefilling and decoding stage is that - # if we have implemented bgmv, it can be utilized during the decoding + # element contains batch_size, max_length, 0 or 1. Use 1 for the + # prefilling stage and 0 for the decoding stage.The reason for + # distinguishing between the prefilling and decoding stage is that + # if we have implemented bgmv, it can be utilized during the decoding # stage. 
self.batch_mlen_stage_lst = [-1] * 3 self._create_lora_modules() @@ -582,12 +582,12 @@ def _set_lora_mapping(self, mapping: LoRAMapping) -> None: dtype=torch.long, device="cuda") self.seq_length_tensor[:batchs].copy_(seq_length_tensor) - temp_tensor=torch.cumsum( - seq_length_tensor, - dim=0, - dtype=seq_length_tensor.dtype) - self.b_seq_start_tensor[1:temp_tensor.size(0)+1].copy_(temp_tensor) - + temp_tensor = torch.cumsum(seq_length_tensor, + dim=0, + dtype=seq_length_tensor.dtype) + self.b_seq_start_tensor[1:temp_tensor.size(0) + + 1].copy_(temp_tensor) + self.batch_mlen_stage_lst[:] = [ batchs, max(mapping.seq_lens), 1 if mapping.is_prefilling else 0 diff --git a/vllm/lora/ops/bgmv_expand.py b/vllm/lora/ops/bgmv_expand.py index aa572151fb41..6132b6047997 100644 --- a/vllm/lora/ops/bgmv_expand.py +++ b/vllm/lora/ops/bgmv_expand.py @@ -51,35 +51,26 @@ def _bgmv_expand_kernel( if CAST_TYPE: tiled_a = tiled_a.to(lora_ptr.dtype.element_ty) # sliding to next row-block - b_ptr = ( - lora_ptr - + l0_stride * lora_index - + pid_sn * split_n_length * lora_k_stride - ) + b_ptr = (lora_ptr + l0_stride * lora_index + + pid_sn * split_n_length * lora_k_stride) for n in range(0, split_n_length, BLOCK_N): current_n = n + offset_n # vector load current_n_c = tl.max_contiguous(current_n, BLOCK_N) - b_ptr_mask = (current_n[:, None] < split_n_length) & ( - offset_k[None, :] < K - ) + b_ptr_mask = (current_n[:, None] < split_n_length) & (offset_k[None, :] + < K) tiled_b = tl.load( - b_ptr - + current_n_c[:, None] * lora_k_stride - + offset_k[None, :] * lora_n_stride, + b_ptr + current_n_c[:, None] * lora_k_stride + + offset_k[None, :] * lora_n_stride, mask=b_ptr_mask, other=0.0, ) # [BLOCK_N,BLOCK_K] accumulator = tl.sum(tiled_a * tiled_b, 1) - c_ptr = ( - out_ptr - + cur_batch * cm_stride - + pid_sn * split_n_length - + current_n * cn_stride - ) + c_ptr = (out_ptr + cur_batch * cm_stride + pid_sn * split_n_length + + current_n * cn_stride) c_mask = current_n < split_n_length if ADD_INPUTS: tiled_out = tl.load(c_ptr, mask=c_mask) @@ -137,8 +128,8 @@ def bgmv_expand( ADD_INPUTS = add_inputs CAST_TYPE = False if inputs.dtype == torch.float32 and lora_b_weights.dtype in [ - torch.float16, - torch.bfloat16, + torch.float16, + torch.bfloat16, ]: CAST_TYPE = True grid = [ diff --git a/vllm/lora/ops/bgmv_expand_slice.py b/vllm/lora/ops/bgmv_expand_slice.py index a197f5eddb8b..63dc3cabb5a9 100644 --- a/vllm/lora/ops/bgmv_expand_slice.py +++ b/vllm/lora/ops/bgmv_expand_slice.py @@ -28,52 +28,51 @@ def _bgmv_expand_slice_kernel( slice_offset, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr, + SPLIT_N: tl.constexpr, ADD_INPUTS: tl.constexpr, CAST_TYPE: tl.constexpr, ): """ C=A@B, and B is col-major matrix """ - cur_batch = tl.program_id(axis=0) + pid_sn = tl.program_id(axis=0) + cur_batch = tl.program_id(axis=1) lora_index = tl.load(lora_indices + cur_batch) if lora_index == -1: return offset_k = tl.arange(0, BLOCK_K) offset_n = tl.arange(0, BLOCK_N) - # tl.max_contiguous(offset_k, BLOCK_K) tiled_a = tl.load( input_ptr + cur_batch * xm_stride + offset_k * xk_stride, mask=offset_k < K, other=0, ) # [BLOCK_K] - b_ptr = lora_ptr + l0_stride * lora_index + + split_n_length = tl.cdiv(N, SPLIT_N) if CAST_TYPE: tiled_a = tiled_a.to(lora_ptr.dtype.element_ty) # sliding to next row-block - - for n in range(0, N, BLOCK_N): + b_ptr = (lora_ptr + l0_stride * lora_index + + pid_sn * split_n_length * lora_k_stride) + for n in range(0, split_n_length, BLOCK_N): current_n = n + offset_n # vector load current_n_c = 
tl.max_contiguous(current_n, BLOCK_N) - b_ptr_mask = (current_n[:, None] < N) & (offset_k[None, :] < K) + b_ptr_mask = (current_n[:, None] < split_n_length) & (offset_k[None, :] + < K) tiled_b = tl.load( - b_ptr - + current_n_c[:, None] * lora_k_stride - + offset_k[None, :] * lora_n_stride, + b_ptr + current_n_c[:, None] * lora_k_stride + + offset_k[None, :] * lora_n_stride, mask=b_ptr_mask, other=0.0, ) # [BLOCK_N,BLOCK_K] accumulator = tl.sum(tiled_a * tiled_b, 1) - c_ptr = ( - out_ptr - + cur_batch * cm_stride - + slice_offset # slice size - + current_n * cn_stride - ) - c_mask = current_n < N + c_ptr = (out_ptr + cur_batch * cm_stride + pid_sn * split_n_length + + slice_offset * cn_stride +current_n * cn_stride) + c_mask = current_n < split_n_length if ADD_INPUTS: tiled_out = tl.load(c_ptr, mask=c_mask) accumulator += tiled_out @@ -86,32 +85,23 @@ def bgmv_expand_slice( lora_b_weights: torch.Tensor, output_tensor: torch.Tensor, lora_indices_tensor: torch.Tensor, - batchs: int, - max_seq_length: int, slice_offset: int, slice_size: int, + batchs: int, add_inputs: bool = False, ): - """_summary_ - + """ Args: inputs (torch.Tensor): input tensor lora_b_weights (torch.Tensor): lora'a weight output_tensor (torch.Tensor): output tensor - b_seq_start_loc (torch.Tensor): (batch_size,). The cumulative - sequence lengths of the sequences in the batch, used to index - into sequence. E.g.,if the sequence length is [4, 6], it is - [0, 4, 10]. - seq_len_tensor (torch.Tensor): (batch_size,). record the sequence - length of the sequences in the batch lora_indices_tensor (torch.Tensor): (batch_size,). The LoRA index corresponding to each batch batchs (int): batch size - max_seq_length (int): The max sequence lengths of the sequences - in the batch slice_offst (int): output_tensor's offst slice_size (int): current output_tensor's size add_inputs (bool, optional): _description_. Defaults to False. + cast_type (bool, optional): _description_. Defaults to False. 
""" assert inputs.dtype in [torch.float16, torch.bfloat16, torch.float32] @@ -134,18 +124,21 @@ def bgmv_expand_slice( assert lora_b_weights.is_contiguous() - N, K = lora_b_weights.shape[-2:] # K= rank,N=hidden_size # TODO tuning this config + + N, K = lora_b_weights.shape[-2:] # K= rank,N=hidden_size BLOCK_N = 512 BLOCK_K = triton.next_power_of_2(K) + SPLIT_N = 8 ADD_INPUTS = add_inputs CAST_TYPE = False if inputs.dtype == torch.float32 and lora_b_weights.dtype in [ - torch.float16, - torch.bfloat16, + torch.float16, + torch.bfloat16, ]: CAST_TYPE = True grid = [ + SPLIT_N, batchs, ] _bgmv_expand_slice_kernel[grid]( @@ -165,6 +158,7 @@ def bgmv_expand_slice( slice_offset, BLOCK_N, BLOCK_K, + SPLIT_N, ADD_INPUTS, CAST_TYPE, ) diff --git a/vllm/lora/ops/bgmv_shrink.py b/vllm/lora/ops/bgmv_shrink.py index ed208796633a..5495e6f54353 100644 --- a/vllm/lora/ops/bgmv_shrink.py +++ b/vllm/lora/ops/bgmv_shrink.py @@ -41,7 +41,7 @@ def _bgmv_shrink_kernel( a_ptr = input_ptr + cur_batch * xm_stride b_ptr = lora_ptr + l0_stride * lora_index rank_mask = offset_n[:, None] < N - accumulator = tl.zeros((BLOCK_N,), dtype=tl.float32) + accumulator = tl.zeros((BLOCK_N, ), dtype=tl.float32) for k in range(0, K, BLOCK_K * SPLIT_K): current_k = k + offset_k # vector load @@ -54,9 +54,8 @@ def _bgmv_shrink_kernel( b_ptr_mask = (rank_mask < N) & (current_k[None, :] < K) tiled_b = tl.load( - b_ptr - + offset_n[:, None] * lora_k_stride - + current_k[None, :] * lora_n_stride, + b_ptr + offset_n[:, None] * lora_k_stride + + current_k[None, :] * lora_n_stride, mask=b_ptr_mask, other=0.0, ) # [BLOCK_N,BLOCK_K] diff --git a/vllm/lora/ops/sgmv_expand_slice.py b/vllm/lora/ops/sgmv_expand_slice.py index 41e65d2a15d4..72ed81bcbbd3 100644 --- a/vllm/lora/ops/sgmv_expand_slice.py +++ b/vllm/lora/ops/sgmv_expand_slice.py @@ -60,7 +60,7 @@ def _sgmv_expand_slice_kernel( offset_k = tl.arange(0, BLOCK_K) ram = tl.max_contiguous(tl.multiple_of(offset_m % M, BLOCK_M), BLOCK_M) rbn = tl.max_contiguous(tl.multiple_of(offset_n % N, BLOCK_N), BLOCK_N) - + a_ptr = (input_ptr + cur_seq_start * xm_stride + ram[:, None] * xm_stride + offset_k[None, :] * xk_stride, ) b_ptr = (lora_ptr + l0_stride * lora_index + diff --git a/vllm/lora/ops/sgmv_shrink.py b/vllm/lora/ops/sgmv_shrink.py index d27bcd15880b..3dd48a8bafac 100644 --- a/vllm/lora/ops/sgmv_shrink.py +++ b/vllm/lora/ops/sgmv_shrink.py @@ -60,18 +60,10 @@ def _sgmv_shrink_kernel( ram = tl.max_contiguous(tl.multiple_of(offset_m % M, BLOCK_M), BLOCK_M) rbn = tl.max_contiguous(tl.multiple_of(offset_n % N, BLOCK_N), BLOCK_N) - a_ptr = ( - input_ptr - + cur_seq_start * xm_stride - + ram[:, None] * xm_stride - + offset_k[None, :] * xk_stride - ) - b_ptr = ( - lora_ptr - + l0_stride * lora_index - + rbn[None, :] * lora_k_stride - + offset_k[:, None] * lora_n_stride - ) + a_ptr = (input_ptr + cur_seq_start * xm_stride + ram[:, None] * xm_stride + + offset_k[None, :] * xk_stride) + b_ptr = (lora_ptr + l0_stride * lora_index + rbn[None, :] * lora_k_stride + + offset_k[:, None] * lora_n_stride) accumulator = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32) for k in range(0, tl.cdiv(K, BLOCK_K * SPLIT_K)): @@ -80,12 +72,12 @@ def _sgmv_shrink_kernel( tiled_b = tl.load(b_ptr) else: k_remaining = K - k * (BLOCK_K * SPLIT_K) - tiled_a = tl.load( - a_ptr, mask=offset_k[None, :] < k_remaining, other=0.0 - ) - tiled_b = tl.load( - b_ptr, mask=offset_k[:, None] < k_remaining, other=0.0 - ) + tiled_a = tl.load(a_ptr, + mask=offset_k[None, :] < k_remaining, + other=0.0) + tiled_b = tl.load(b_ptr, + 
mask=offset_k[:, None] < k_remaining, + other=0.0) accumulator += tl.dot(tiled_a, tiled_b) a_ptr += BLOCK_K * SPLIT_K * xk_stride @@ -93,14 +85,10 @@ def _sgmv_shrink_kernel( offset_cm = cur_seq_start + tl.arange(0, BLOCK_M) + pid_m * BLOCK_M offset_cn = tl.arange(0, BLOCK_N) + pid_n * BLOCK_N - c_ptr = ( - out_ptr - + offset_cm[:, None] * cm_stride - + offset_cn[None, :] * cn_stride - ) - c_mask = (offset_cm[:, None] < (cur_seq_start + M)) & ( - offset_cn[None, :] < N - ) + c_ptr = (out_ptr + offset_cm[:, None] * cm_stride + + offset_cn[None, :] * cn_stride) + c_mask = (offset_cm[:, None] < + (cur_seq_start + M)) & (offset_cn[None, :] < N) accumulator *= scaling # handles write-back with reduction-splitting if SPLIT_K == 1: @@ -172,7 +160,7 @@ def sgmv_shrink( SPLIT_K, batchs, ] - + # grid = lambda META: (triton.cdiv(max_seq_length, META[ # 'BLOCK_M']) * triton.cdiv(N, META['BLOCK_N']),META[ # 'SPLIT_K'],batchs) diff --git a/vllm/lora/punica.py b/vllm/lora/punica.py index 4f4fccca8051..f3ebc29ecfea 100644 --- a/vllm/lora/punica.py +++ b/vllm/lora/punica.py @@ -6,7 +6,10 @@ from vllm.lora.ops.sgmv_expand import sgmv_expand from vllm.lora.ops.sgmv_shrink import sgmv_shrink +from vllm.lora.ops.bgmv_expand import bgmv_expand +from vllm.lora.ops.bgmv_shrink import bgmv_shrink from vllm.lora.ops.sgmv_expand_slice import sgmv_expand_slice +from vllm.lora.ops.bgmv_expand_slice import bgmv_expand_slice def _raise_import_error(e): @@ -164,6 +167,7 @@ def add_lora_triton( max_length: int, layer_idx: int, scale: float, + is_prefilling: bool, *, buffer: Optional[torch.Tensor] = None, ): @@ -175,6 +179,49 @@ def add_lora_triton( buffer = torch.zeros((x.size(0), r), dtype=torch.float32, device=x.device) + if is_prefilling: + _lora_sgmv( + y, + x, + wa_t_all, + wb_t_all, + b_seq_start_tensor, + seq_length_tensor, + lora_indices_tensor, + batch_size, + max_length, + layer_idx, + scale, + buffer=buffer, + ) + else: + _lora_bgmv( + y, + x, + wa_t_all, + wb_t_all, + lora_indices_tensor, + batch_size, + layer_idx, + scale, + buffer=buffer, + ) + + +def _lora_sgmv( + y: torch.Tensor, + x: torch.Tensor, + wa_t_all: torch.Tensor, + wb_t_all: torch.Tensor, + b_seq_start_tensor: torch.Tensor, + seq_length_tensor: torch.Tensor, + lora_indices_tensor: torch.Tensor, + batch_size: int, + max_length: int, + layer_idx: int, + scale: float, + buffer: torch.Tensor, +): sgmv_shrink( x, wa_t_all, @@ -199,6 +246,26 @@ def add_lora_triton( ) +def _lora_bgmv( + y: torch.Tensor, + x: torch.Tensor, + wa_t_all: torch.Tensor, + wb_t_all: torch.Tensor, + lora_indices_tensor: torch.Tensor, + batch_size: int, + layer_idx: int, + scale: float, + buffer: torch.Tensor, +): + bgmv_shrink(x, wa_t_all, buffer, lora_indices_tensor, batch_size, scale) + bgmv_expand(buffer, + wb_t_all, + y, + lora_indices_tensor, + batch_size, + add_inputs=True) + + def add_lora_slice( y: torch.Tensor, x: torch.Tensor, @@ -288,6 +355,7 @@ def add_lora_triton_slice( scale: float, y_offset: int, y_slice_size: int, + is_prefilling: bool, *, buffer: Optional[torch.Tensor] = None, ): @@ -315,7 +383,7 @@ def add_lora_triton_slice( scale: Scaling factor. y_offset: Offset to apply to the starting column of y. y_slice_size: Size of the y column slice. 
- # """ + #""" # try: # import vllm._punica_C as punica_kernels # except ImportError as e: @@ -329,6 +397,23 @@ def add_lora_triton_slice( buffer = torch.zeros((x.size(0), r), dtype=torch.float32, device=x.device) + if is_prefilling: + _lora_sgmv_nslice(y, x, wa_t_all, wb_t_all, b_seq_start_tensor, + seq_length_tensor, lora_indices_tensor, batch_size, + max_length, layer_idx, scale, y_offset, y_slice_size, + buffer) + else: + _lora_bgmv_nslice(y, x, wa_t_all, wb_t_all, lora_indices_tensor, + batch_size, layer_idx, scale, y_offset, y_slice_size, + buffer) + + +def _lora_sgmv_nslice(y: torch.Tensor, x: torch.Tensor, wa_t_all: torch.Tensor, + wb_t_all: torch.Tensor, b_seq_start_tensor: torch.Tensor, + seq_length_tensor: torch.Tensor, + lora_indices_tensor: torch.Tensor, batch_size: int, + max_length: int, layer_idx: int, scale: float, + y_offset: int, y_slice_size: int, buffer): sgmv_shrink( x, wa_t_all, @@ -353,3 +438,19 @@ def add_lora_triton_slice( y_slice_size, add_inputs=True, ) + + +def _lora_bgmv_nslice(y: torch.Tensor, x: torch.Tensor, wa_t_all: torch.Tensor, + wb_t_all: torch.Tensor, + lora_indices_tensor: torch.Tensor, batch_size: int, + layer_idx: int, scale: float, y_offset: int, + y_slice_size: int, buffer): + bgmv_shrink(x, wa_t_all, buffer, lora_indices_tensor, batch_size, scale) + bgmv_expand_slice(buffer, + wb_t_all, + y, + lora_indices_tensor, + y_offset, + y_slice_size, + batch_size, + add_inputs=True) From ab85bb54f30e786901a81c555e43b13293e62700 Mon Sep 17 00:00:00 2001 From: jeejeeli Date: Wed, 5 Jun 2024 00:58:36 +0800 Subject: [PATCH 14/71] add bgmv --- vllm/lora/layers.py | 69 ++++++++++++++---------------- vllm/lora/ops/bgmv_expand_slice.py | 2 +- 2 files changed, 32 insertions(+), 39 deletions(-) diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py index ba7f52ff2fb1..b2605bf96b21 100644 --- a/vllm/lora/layers.py +++ b/vllm/lora/layers.py @@ -127,10 +127,12 @@ def _apply_lora_triton( batch_size = batch_mlen_stage_lst[0] max_length = batch_mlen_stage_lst[1] is_prefilling = bool(batch_mlen_stage_lst[2]) - + # maybe we need not restrict range to [:batch_size] add_lora_triton(output, x, lora_a_stacked, lora_b_stacked, - b_seq_start_tensor, seq_length_tensor, lora_index_tensor, - batch_size, max_length, 0, 1.0, is_prefilling) + b_seq_start_tensor[:batch_size], + seq_length_tensor[:batch_size], + lora_index_tensor[:batch_size], batch_size, max_length, 0, + 1.0, is_prefilling) return output.view_as(org_output) @@ -211,11 +213,11 @@ def _apply_lora_triton_nslice( offset_left = 0 #TODO fuse these kernel for slice_idx in range(len(output_slices)): - add_lora_triton_slice(output, x, lora_a_stacked[slice_idx], - lora_b_stacked[slice_idx], b_seq_start_tensor, - seq_length_tensor, lora_index_tensor, batch_size, - max_length, 0, 1.0, offset_left, - output_slices[slice_idx], is_prefilling) + add_lora_triton_slice( + output, x, lora_a_stacked[slice_idx], lora_b_stacked[slice_idx], + b_seq_start_tensor[:batch_size], seq_length_tensor[:batch_size], + lora_index_tensor[:batch_size], batch_size, max_length, 0, 1.0, + offset_left, output_slices[slice_idx], is_prefilling) offset_left += output_slices[slice_idx] return output.view_as(org_output) @@ -554,13 +556,9 @@ def set_mapping( def apply(self, x: torch.Tensor, bias: Optional[torch.Tensor]) -> torch.Tensor: output = self.base_layer.quant_method.apply(self.base_layer, x, bias) - batch_size = self.batch_mlen_stage_lst[0] - # maybe we need not restrict range to [:batch_size] _apply_lora_triton(x, self.lora_a_stacked, 
self.lora_b_stacked, - self.b_seq_start_tensor[:batch_size], - self.seq_length_tensor[:batch_size], - self.indices[:batch_size], - self.batch_mlen_stage_lst, output) + self.b_seq_start_tensor, self.seq_length_tensor, + self.indices, self.batch_mlen_stage_lst, output) return output def forward(self, input_): @@ -722,14 +720,13 @@ def apply(self, x: torch.Tensor, # output, # (self.output_dim, self.output_dim), # ) - batch_size = self.batch_mlen_stage_lst[0] _apply_lora_triton_nslice( x, self.lora_a_stacked, self.lora_b_stacked, - self.b_seq_start_tensor[:batch_size], - self.seq_length_tensor[:batch_size], - self.indices[:batch_size], + self.b_seq_start_tensor, + self.seq_length_tensor, + self.indices, self.batch_mlen_stage_lst, output, (self.output_dim, self.output_dim), @@ -998,14 +995,13 @@ def apply(self, x: torch.Tensor, # output, # self.output_slices, # ) - batch_size = self.batch_mlen_stage_lst[0] _apply_lora_triton_nslice( x, self.lora_a_stacked, self.lora_b_stacked, - self.b_seq_start_tensor[:batch_size], - self.seq_length_tensor[:batch_size], - self.indices[:batch_size], + self.b_seq_start_tensor, + self.seq_length_tensor, + self.indices, self.batch_mlen_stage_lst, output, self.output_slices, @@ -1121,13 +1117,10 @@ def set_mapping( def apply(self, x: torch.Tensor) -> torch.Tensor: output = self.base_layer.quant_method.apply(self.base_layer, x) - batch_size = self.batch_mlen_stage_lst[0] # maybe we need not restrict range to [:batch_size] _apply_lora_triton(x, self.lora_a_stacked, self.lora_b_stacked, - self.b_seq_start_tensor[:batch_size], - self.seq_length_tensor[:batch_size], - self.indices[:batch_size], - self.batch_mlen_stage_lst, output) + self.b_seq_start_tensor, self.seq_length_tensor, + self.indices, self.batch_mlen_stage_lst, output) return output # def apply(self, x: torch.Tensor) -> torch.Tensor: @@ -1373,17 +1366,17 @@ def _get_logits( logits, ) - # batch_size=self.batch_mlen_stage_lst[0] - # _apply_lora_triton(hidden_states, self.lora_a_stacked, self.lora_b_stacked, - # self.b_seq_start_tensor[:batch_size], - # self.seq_length_tensor[:batch_size], - # self.indices[:self.indices_len[1]], - # self.batch_mlen_stage_lst, logits_temp) - # flag=torch.allclose(logits_temp,logits,rtol=1e-2,atol=1e-2) - # if flag: - # print("pass") - # else: - # print("error") + logits_temp = logits.clone() + _apply_lora_triton(hidden_states, self.lora_a_stacked, + self.lora_b_stacked, self.b_seq_start_tensor, + self.seq_length_tensor, + self.indices[:self.indices_len[1]], + self.batch_mlen_stage_lst, logits_temp) + flag = torch.allclose(logits_temp, logits, rtol=1e-2, atol=1e-2) + if flag: + print("pass") + else: + print("error") # Remove paddings in vocab (if any). 
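# Illustration only (the values below are made up, not taken from this patch):
# _apply_lora_triton assumes the metadata layout
#   batch_mlen_stage_lst = [batch_size, max_seq_length, stage]
# where stage is 1 for prefilling (sgmv kernels) and 0 for decoding (bgmv
# kernels); it reads the flag with bool(batch_mlen_stage_lst[2]).
batch_mlen_stage_lst = [8, 128, 1]   # e.g. a prefill batch of 8, longest seq 128
decode_meta = batch_mlen_stage_lst.copy()
decode_meta[2] = 0                   # add_lora_triton would dispatch the bgmv path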
logits = logits[:, :self.base_layer.vocab_size] diff --git a/vllm/lora/ops/bgmv_expand_slice.py b/vllm/lora/ops/bgmv_expand_slice.py index 63dc3cabb5a9..5e30312c7e18 100644 --- a/vllm/lora/ops/bgmv_expand_slice.py +++ b/vllm/lora/ops/bgmv_expand_slice.py @@ -71,7 +71,7 @@ def _bgmv_expand_slice_kernel( accumulator = tl.sum(tiled_a * tiled_b, 1) c_ptr = (out_ptr + cur_batch * cm_stride + pid_sn * split_n_length + - slice_offset * cn_stride +current_n * cn_stride) + slice_offset * cn_stride + current_n * cn_stride) c_mask = current_n < split_n_length if ADD_INPUTS: tiled_out = tl.load(c_ptr, mask=c_mask) From f99b3d27ae454a26caa61a943900ae682cfcf4cb Mon Sep 17 00:00:00 2001 From: jeejeeli Date: Wed, 5 Jun 2024 09:33:31 +0800 Subject: [PATCH 15/71] repalcing punica completed --- vllm/lora/layers.py | 143 +-------------- vllm/lora/punica.py | 437 ++++++++++++++++++++++---------------------- 2 files changed, 225 insertions(+), 355 deletions(-) diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py index b2605bf96b21..ff922a14d879 100644 --- a/vllm/lora/layers.py +++ b/vllm/lora/layers.py @@ -16,8 +16,8 @@ tensor_model_parallel_all_reduce, tensor_model_parallel_gather) from vllm.distributed.utils import divide -from vllm.lora.punica import (add_lora, add_lora_triton, add_lora_slice, - add_lora_triton_slice, bgmv) +from vllm.lora.punica import (add_lora_triton, + add_lora_triton_slice) from vllm.lora.ops.sgmv_expand import sgmv_expand from vllm.model_executor.layers.linear import (ColumnParallelLinear, MergedColumnParallelLinear, @@ -63,38 +63,6 @@ def dec(*args, **kwargs): return dec - -def _apply_lora( - x: torch.Tensor, - lora_a_stacked: torch.Tensor, - lora_b_stacked: torch.Tensor, - indices: torch.Tensor, - output: torch.Tensor, -): - """Applies lora to each input. - - This method applies all loras to each input. It uses the - indices vector to determine which lora yields the - correct output. An index of -1 means no lora should be - applied. This method adds the final lora results to the - output. - - Input shapes: - x: (batch_size, hidden_dim) - lora_a_stacked: (num_loras, lora_rank, hidden_dim) - lora_b_stacked: (num_loras, output_dim, lora_rank) - indices: (batch_size) - output: (batch_size, output_dim) - """ - org_output = output - x = x.view(-1, x.shape[-1]) - output = output.view(-1, output.shape[-1]) - indices = indices.view(-1) - buffer = add_lora(output, x, lora_a_stacked, lora_b_stacked, indices, 0, - 1.0) - return buffer, output.view_as(org_output) - - def _apply_lora_triton( x: torch.Tensor, lora_a_stacked: torch.Tensor, @@ -136,47 +104,6 @@ def _apply_lora_triton( return output.view_as(org_output) -def _apply_lora_packed_nslice( - x: torch.Tensor, - lora_a_stacked: Tuple[torch.Tensor, torch.Tensor, torch.Tensor], - lora_b_stacked: Tuple[torch.Tensor, torch.Tensor, torch.Tensor], - indices: torch.Tensor, - output: torch.Tensor, - output_slices: Tuple[int, ...], -): - """Applies lora to each input. - - This method applies all loras to each input. It uses the - indices vector to determine which lora yields the - correct output. An index of -1 means no lora should be - applied. This method adds the final lora results to the - output. - - This method is used for layers that are composed of multiple sublayers - (slices) packed together. 
- - Input shapes: - x: (batch_size, hidden_dim) - lora_a_stacked: 3 element tuple of (num_loras, lora_rank, hidden_dim) - lora_b_stacked: 3 element tuple of (num_loras, output_dim, lora_rank) - indices: (batch_size) - output: (batch_size, q_slice_size + 2*kv_slice_size) - output_slices: n-1 element tuple of (slice_size...), - where n is number of slices - """ - org_output = output - x = x.view(-1, x.shape[-1]) - output = output.view(-1, output.shape[-1]) - indices = indices.view(-1) - offset_left = 0 - for slice_idx in range(len(output_slices)): - add_lora_slice(output, x, lora_a_stacked[slice_idx], - lora_b_stacked[slice_idx], indices, 0, 1.0, offset_left, - output_slices[slice_idx]) - offset_left += output_slices[slice_idx] - return output.view_as(org_output) - - def _apply_lora_triton_nslice( x: torch.Tensor, lora_a_stacked: Tuple[torch.Tensor, torch.Tensor, torch.Tensor], @@ -711,15 +638,6 @@ def set_lora( def apply(self, x: torch.Tensor, bias: Optional[torch.Tensor]) -> torch.Tensor: output = self.base_layer.quant_method.apply(self.base_layer, x, bias) - # output_temp=output.clone() - # _apply_lora_packed_nslice( - # x, - # self.lora_a_stacked, - # self.lora_b_stacked, - # self.indices[:self.indices_len[0]], - # output, - # (self.output_dim, self.output_dim), - # ) _apply_lora_triton_nslice( x, self.lora_a_stacked, @@ -731,11 +649,6 @@ def apply(self, x: torch.Tensor, output, (self.output_dim, self.output_dim), ) - # flag=torch.allclose(output,output_temp,1e-2,1e-2) - # if flag: - # print("pass") - # else: - # print() return output @classmethod @@ -987,14 +900,6 @@ def set_lora( def apply(self, x: torch.Tensor, bias: Optional[torch.Tensor]) -> torch.Tensor: output = self.base_layer.quant_method.apply(self.base_layer, x, bias) - # _apply_lora_packed_nslice( - # x, - # self.lora_a_stacked, - # self.lora_b_stacked, - # self.indices[:self.indices_len[0]], - # output, - # self.output_slices, - # ) _apply_lora_triton_nslice( x, self.lora_a_stacked, @@ -1123,31 +1028,6 @@ def apply(self, x: torch.Tensor) -> torch.Tensor: self.indices, self.batch_mlen_stage_lst, output) return output - # def apply(self, x: torch.Tensor) -> torch.Tensor: - # output = self.base_layer.quant_method.apply(self.base_layer, x) - # temp_output = output.clone() - # output2 = output.clone() - # mid_buffer,_=_apply_lora( - # x, - # self.lora_a_stacked, - # self.lora_b_stacked, - # self.indices[:self.indices_len[0]], - # output, - # ) - # batch_size = self.batch_mlen_stage_lst[0] - # mid2_buffer,_=_apply_lora_triton(x, self.lora_a_stacked, - # self.lora_b_stacked, - # self.b_seq_start_tensor[:batch_size], - # self.seq_length_tensor[:batch_size], - # self.indices[:batch_size], - # self.batch_mlen_stage_lst, output) - # flag = torch.allclose(mid_buffer, mid2_buffer, 3e-2, 2e-2) - # # if not flag: - # # print("error") - # # else: - # # print("pass") - # return temp_output - def forward(self, input_): """Forward of RowParallelLinear @@ -1358,25 +1238,14 @@ def _get_logits( self.base_layer.org_vocab_size:self.base_layer.org_vocab_size + lora_logits.shape[1]] = lora_logits - _apply_lora( - hidden_states, - self.lora_a_stacked, - self.lora_b_stacked, - self.indices[:self.indices_len[1]], - logits, - ) - - logits_temp = logits.clone() + batch_mlen_stage_lst=self.batch_mlen_stage_lst.copy() + # LogitsProcessorWithLoRA always using bgmv + batch_mlen_stage_lst[2]=False _apply_lora_triton(hidden_states, self.lora_a_stacked, self.lora_b_stacked, self.b_seq_start_tensor, self.seq_length_tensor, self.indices[:self.indices_len[1]], - 
self.batch_mlen_stage_lst, logits_temp) - flag = torch.allclose(logits_temp, logits, rtol=1e-2, atol=1e-2) - if flag: - print("pass") - else: - print("error") + batch_mlen_stage_lst, logits) # Remove paddings in vocab (if any). logits = logits[:, :self.base_layer.vocab_size] diff --git a/vllm/lora/punica.py b/vllm/lora/punica.py index f3ebc29ecfea..90ce268c903b 100644 --- a/vllm/lora/punica.py +++ b/vllm/lora/punica.py @@ -12,148 +12,225 @@ from vllm.lora.ops.bgmv_expand_slice import bgmv_expand_slice -def _raise_import_error(e): - if torch.cuda.get_device_capability() < (8, 0): - raise ImportError( - "punica LoRA kernels require compute capability >= 8.0") from e - else: - raise ImportError( - "punica LoRA kernels could not be imported. If you built vLLM " - "from source, make sure VLLM_INSTALL_PUNICA_KERNELS=1 env var " - "was set.") from e - - -def bgmv( - y: torch.Tensor, - x: torch.Tensor, - w_t_all: torch.Tensor, - indicies: torch.LongTensor, - layer_idx: int, - scale: float, -): - """ - Semantics: - y[i] += ( - x[i].unsqueeze(0) - @ w_t_all[indices[i], layer_idx, :, :].transpose(-1, -2) - * scale - ).squeeze(0) - - Args: - y: Shape: `[B, H2]`. Output vectors. Will be changed in-place. - x: Shape: `[B, H1]`. Input vectors. - w_t_all: Shape: `[None, L, H2, H1]`. All of the transposed weight - matrices. - indicies: Shape: `[B]`. Indices of the weight matrices. - layer_idx: Layer index of the weight matrices. - scale: Scaling factor. - """ - try: - import vllm._punica_C as punica_kernels - except ImportError as e: - _raise_import_error(e) - - punica_kernels.dispatch_bgmv(y, x, w_t_all, indicies, layer_idx, scale) - - -def dispatch_bgmv_low_level( - y: torch.Tensor, - x: torch.Tensor, - w_t_all: torch.Tensor, - indicies: torch.LongTensor, - layer_idx: int, - scale: float, - y_offset: int, - y_slice_size: int, -): - """ - Same as `bgmv` but you can operate on slices of y. - Pass whole y, define y_offset and y_slice_size. - - Semantics: - y[i] += ( - x[i].unsqueeze(0) - @ w_t_all[indices[i], layer_idx, :, :].transpose(-1, -2) - * scale - ).squeeze(0) - - Args: - y: Shape: `[B, H2]`. Output vectors. Will be changed in-place. - x: Shape: `[B, H1]`. Input vectors. - w_t_all: Shape: `[None, L, y_slice_size, H1]`. Column partition of - all of the transposed LoRA matrices. - indicies: Shape: `[B]`. Indices of the LoRA weights. - layer_idx: Layer index of LoRA weights. - scale: Scaling factor. - y_offset: Offset to apply to the starting column of y. - y_slice_size: Size of the y column slice. - """ - try: - import vllm._punica_C as punica_kernels - except ImportError as e: - _raise_import_error(e) - punica_kernels.dispatch_bgmv_low_level( - y, - x, - w_t_all, - indicies, - layer_idx, - scale, - x.size(1), - y_slice_size, - y_offset, - ) - - -def add_lora( - y: torch.Tensor, - x: torch.Tensor, - wa_t_all: torch.Tensor, - wb_t_all: torch.Tensor, - indicies: torch.LongTensor, - layer_idx: int, - scale: float, - *, - buffer: Optional[torch.Tensor] = None, -): - """ - Semantics: - y[i] += ( - x[i].unsqueeze(0) - @ wa_t_all[indices[i], layer_idx, :, :].transpose(-1, -2) - @ wb_t_all[indices[i], layer_idx, :, :].transpose(-1, -2) - * scale - ).squeeze(0) - - Args: - y: Shape: `[B, H2]`. Output vectors. Will be changed in-place. - x: Shape: `[B, H1]`. Input vectors. - wa_t_all: Shape: `[None, L, R, H1]`. All of the transposed - LoRA A matrices. - wb_t_all: Shape: `[None, L, H2, R]`. All of the transposed - LoRA B matrices. - indicies: Shape: `[B]`. Indices of the LoRA weights. 
- layer_idx: Layer index of LoRA weights. - scale: Scaling factor. - buffer: Optional. Shape: `[B, R]`. Temporary buffer. - """ - try: - import vllm._punica_C as punica_kernels - except ImportError as e: - _raise_import_error(e) - - r = wb_t_all.size(-1) - if buffer is None: - # We set the buffer to be float32 by default to avoid - # numerical inaccuracies that would otherwise happen - # due to downcasting. - buffer = torch.zeros((x.size(0), r), - dtype=torch.float32, - device=x.device) - punica_kernels.dispatch_bgmv(buffer, x, wa_t_all, indicies, layer_idx, 1.0) - punica_kernels.dispatch_bgmv(y, buffer, wb_t_all, indicies, layer_idx, - scale) - +# def _raise_import_error(e): +# if torch.cuda.get_device_capability() < (8, 0): +# raise ImportError( +# "punica LoRA kernels require compute capability >= 8.0") from e +# else: +# raise ImportError( +# "punica LoRA kernels could not be imported. If you built vLLM " +# "from source, make sure VLLM_INSTALL_PUNICA_KERNELS=1 env var " +# "was set.") from e + + +# def bgmv( +# y: torch.Tensor, +# x: torch.Tensor, +# w_t_all: torch.Tensor, +# indicies: torch.LongTensor, +# layer_idx: int, +# scale: float, +# ): +# """ +# Semantics: +# y[i] += ( +# x[i].unsqueeze(0) +# @ w_t_all[indices[i], layer_idx, :, :].transpose(-1, -2) +# * scale +# ).squeeze(0) + +# Args: +# y: Shape: `[B, H2]`. Output vectors. Will be changed in-place. +# x: Shape: `[B, H1]`. Input vectors. +# w_t_all: Shape: `[None, L, H2, H1]`. All of the transposed weight +# matrices. +# indicies: Shape: `[B]`. Indices of the weight matrices. +# layer_idx: Layer index of the weight matrices. +# scale: Scaling factor. +# """ +# try: +# import vllm._punica_C as punica_kernels +# except ImportError as e: +# _raise_import_error(e) + +# punica_kernels.dispatch_bgmv(y, x, w_t_all, indicies, layer_idx, scale) + + +# def dispatch_bgmv_low_level( +# y: torch.Tensor, +# x: torch.Tensor, +# w_t_all: torch.Tensor, +# indicies: torch.LongTensor, +# layer_idx: int, +# scale: float, +# y_offset: int, +# y_slice_size: int, +# ): +# """ +# Same as `bgmv` but you can operate on slices of y. +# Pass whole y, define y_offset and y_slice_size. + +# Semantics: +# y[i] += ( +# x[i].unsqueeze(0) +# @ w_t_all[indices[i], layer_idx, :, :].transpose(-1, -2) +# * scale +# ).squeeze(0) + +# Args: +# y: Shape: `[B, H2]`. Output vectors. Will be changed in-place. +# x: Shape: `[B, H1]`. Input vectors. +# w_t_all: Shape: `[None, L, y_slice_size, H1]`. Column partition of +# all of the transposed LoRA matrices. +# indicies: Shape: `[B]`. Indices of the LoRA weights. +# layer_idx: Layer index of LoRA weights. +# scale: Scaling factor. +# y_offset: Offset to apply to the starting column of y. +# y_slice_size: Size of the y column slice. +# """ +# try: +# import vllm._punica_C as punica_kernels +# except ImportError as e: +# _raise_import_error(e) +# punica_kernels.dispatch_bgmv_low_level( +# y, +# x, +# w_t_all, +# indicies, +# layer_idx, +# scale, +# x.size(1), +# y_slice_size, +# y_offset, +# ) + + +# def add_lora( +# y: torch.Tensor, +# x: torch.Tensor, +# wa_t_all: torch.Tensor, +# wb_t_all: torch.Tensor, +# indicies: torch.LongTensor, +# layer_idx: int, +# scale: float, +# *, +# buffer: Optional[torch.Tensor] = None, +# ): +# """ +# Semantics: +# y[i] += ( +# x[i].unsqueeze(0) +# @ wa_t_all[indices[i], layer_idx, :, :].transpose(-1, -2) +# @ wb_t_all[indices[i], layer_idx, :, :].transpose(-1, -2) +# * scale +# ).squeeze(0) + +# Args: +# y: Shape: `[B, H2]`. Output vectors. Will be changed in-place. 
+# x: Shape: `[B, H1]`. Input vectors. +# wa_t_all: Shape: `[None, L, R, H1]`. All of the transposed +# LoRA A matrices. +# wb_t_all: Shape: `[None, L, H2, R]`. All of the transposed +# LoRA B matrices. +# indicies: Shape: `[B]`. Indices of the LoRA weights. +# layer_idx: Layer index of LoRA weights. +# scale: Scaling factor. +# buffer: Optional. Shape: `[B, R]`. Temporary buffer. +# """ +# try: +# import vllm._punica_C as punica_kernels +# except ImportError as e: +# _raise_import_error(e) + +# r = wb_t_all.size(-1) +# if buffer is None: +# # We set the buffer to be float32 by default to avoid +# # numerical inaccuracies that would otherwise happen +# # due to downcasting. +# buffer = torch.zeros((x.size(0), r), +# dtype=torch.float32, +# device=x.device) +# punica_kernels.dispatch_bgmv(buffer, x, wa_t_all, indicies, layer_idx, 1.0) +# punica_kernels.dispatch_bgmv(y, buffer, wb_t_all, indicies, layer_idx, +# scale) + + + + + +# def add_lora_slice( +# y: torch.Tensor, +# x: torch.Tensor, +# wa_t_all: torch.Tensor, +# wb_t_all: torch.Tensor, +# indicies: torch.LongTensor, +# layer_idx: int, +# scale: float, +# y_offset: int, +# y_slice_size: int, +# *, +# buffer: Optional[torch.Tensor] = None, +# ): +# """ +# Same as `add_lora` but you can operate on slices of y. +# Pass whole y, define y_offset and y_slice_size. + +# Semantics: +# y[i] += ( +# x[i].unsqueeze(0) +# @ wa_t_all[indices[i], layer_idx, :, :].transpose(-1, -2) +# @ wb_t_all[indices[i], layer_idx, :, :].transpose(-1, -2) +# * scale +# ).squeeze(0) + +# Args: +# y: Shape: `[B, H2]`. Output vectors. Will be changed in-place. +# x: Shape: `[B, H1]`. Input vectors. +# wa_t_all: Shape: `[None, L, R, H1]`. All of the transposed +# LoRA A matrices. +# wb_t_all: Shape: `[None, L, H2, R]`. All of the transposed +# LoRA B matrices. +# indicies: Shape: `[B]`. Indices of the LoRA weights. +# layer_idx: Layer index of LoRA weights. +# scale: Scaling factor. +# y_offset: Offset to apply to the starting column of y. +# y_slice_size: Size of the y column slice. +# """ +# try: +# import vllm._punica_C as punica_kernels +# except ImportError as e: +# _raise_import_error(e) + +# r = wb_t_all.size(-1) +# if buffer is None: +# # We set the buffer to be float32 by default to avoid +# # numerical inaccuracies that would otherwise happen +# # due to downcasting. +# buffer = torch.zeros((x.size(0), r), +# dtype=torch.float32, +# device=x.device) +# punica_kernels.dispatch_bgmv_low_level( +# buffer, +# x, +# wa_t_all, +# indicies, +# layer_idx, +# 1.0, +# x.size(1), +# buffer.size(1), +# 0, +# ) +# punica_kernels.dispatch_bgmv_low_level( +# y, +# buffer, +# wb_t_all, +# indicies, +# layer_idx, +# scale, +# buffer.size(1), +# y_slice_size, +# y_offset, +# ) def add_lora_triton( y: torch.Tensor, @@ -265,82 +342,6 @@ def _lora_bgmv( batch_size, add_inputs=True) - -def add_lora_slice( - y: torch.Tensor, - x: torch.Tensor, - wa_t_all: torch.Tensor, - wb_t_all: torch.Tensor, - indicies: torch.LongTensor, - layer_idx: int, - scale: float, - y_offset: int, - y_slice_size: int, - *, - buffer: Optional[torch.Tensor] = None, -): - """ - Same as `add_lora` but you can operate on slices of y. - Pass whole y, define y_offset and y_slice_size. - - Semantics: - y[i] += ( - x[i].unsqueeze(0) - @ wa_t_all[indices[i], layer_idx, :, :].transpose(-1, -2) - @ wb_t_all[indices[i], layer_idx, :, :].transpose(-1, -2) - * scale - ).squeeze(0) - - Args: - y: Shape: `[B, H2]`. Output vectors. Will be changed in-place. - x: Shape: `[B, H1]`. Input vectors. 
- wa_t_all: Shape: `[None, L, R, H1]`. All of the transposed - LoRA A matrices. - wb_t_all: Shape: `[None, L, H2, R]`. All of the transposed - LoRA B matrices. - indicies: Shape: `[B]`. Indices of the LoRA weights. - layer_idx: Layer index of LoRA weights. - scale: Scaling factor. - y_offset: Offset to apply to the starting column of y. - y_slice_size: Size of the y column slice. - """ - try: - import vllm._punica_C as punica_kernels - except ImportError as e: - _raise_import_error(e) - - r = wb_t_all.size(-1) - if buffer is None: - # We set the buffer to be float32 by default to avoid - # numerical inaccuracies that would otherwise happen - # due to downcasting. - buffer = torch.zeros((x.size(0), r), - dtype=torch.float32, - device=x.device) - punica_kernels.dispatch_bgmv_low_level( - buffer, - x, - wa_t_all, - indicies, - layer_idx, - 1.0, - x.size(1), - buffer.size(1), - 0, - ) - punica_kernels.dispatch_bgmv_low_level( - y, - buffer, - wb_t_all, - indicies, - layer_idx, - scale, - buffer.size(1), - y_slice_size, - y_offset, - ) - - def add_lora_triton_slice( y: torch.Tensor, x: torch.Tensor, From ef8e83a6bdae21b829eeda0e3406886c2751d49f Mon Sep 17 00:00:00 2001 From: jeejeeli Date: Wed, 5 Jun 2024 13:25:02 +0800 Subject: [PATCH 16/71] fix bug --- vllm/lora/ops/bgmv_expand_slice.py | 5 +- vllm/lora/ops/sgmv_shrink.py | 6 - vllm/lora/punica.py | 438 ++++++++++++++--------------- 3 files changed, 221 insertions(+), 228 deletions(-) diff --git a/vllm/lora/ops/bgmv_expand_slice.py b/vllm/lora/ops/bgmv_expand_slice.py index 5e30312c7e18..262f7669e0a2 100644 --- a/vllm/lora/ops/bgmv_expand_slice.py +++ b/vllm/lora/ops/bgmv_expand_slice.py @@ -93,15 +93,14 @@ def bgmv_expand_slice( """ Args: inputs (torch.Tensor): input tensor - lora_b_weights (torch.Tensor): lora'a weight + lora_b_weights (torch.Tensor): lora'b weight output_tensor (torch.Tensor): output tensor lora_indices_tensor (torch.Tensor): (batch_size,). The LoRA index corresponding to each batch - batchs (int): batch size slice_offst (int): output_tensor's offst slice_size (int): current output_tensor's size + batchs (int): batch size add_inputs (bool, optional): _description_. Defaults to False. - cast_type (bool, optional): _description_. Defaults to False. """ assert inputs.dtype in [torch.float16, torch.bfloat16, torch.float32] diff --git a/vllm/lora/ops/sgmv_shrink.py b/vllm/lora/ops/sgmv_shrink.py index 3dd48a8bafac..518cf70bbf12 100644 --- a/vllm/lora/ops/sgmv_shrink.py +++ b/vllm/lora/ops/sgmv_shrink.py @@ -144,12 +144,6 @@ def sgmv_shrink( assert output_tensor.is_contiguous() # TODO tuning this config N, K = lora_a_weights.shape[-2:] # K=hidden_size,N=rank - # BLOCK_M = config.get("BLOCK_M", 32) - # BLOCK_N = config.get("BLOCK_N", 32) - # BLOCK_K = config.get("BLOCK_K", 32) - # SPLIT_K = config.get("SPLIT_K", 16) - # num_warps = config.get("num_warps", 4) - # num_stages = config.get("num_stages", 3) BLOCK_M = 32 BLOCK_N = 16 BLOCK_K = 32 diff --git a/vllm/lora/punica.py b/vllm/lora/punica.py index 90ce268c903b..ad48abf9bb9c 100644 --- a/vllm/lora/punica.py +++ b/vllm/lora/punica.py @@ -12,225 +12,225 @@ from vllm.lora.ops.bgmv_expand_slice import bgmv_expand_slice -# def _raise_import_error(e): -# if torch.cuda.get_device_capability() < (8, 0): -# raise ImportError( -# "punica LoRA kernels require compute capability >= 8.0") from e -# else: -# raise ImportError( -# "punica LoRA kernels could not be imported. 
If you built vLLM " -# "from source, make sure VLLM_INSTALL_PUNICA_KERNELS=1 env var " -# "was set.") from e - - -# def bgmv( -# y: torch.Tensor, -# x: torch.Tensor, -# w_t_all: torch.Tensor, -# indicies: torch.LongTensor, -# layer_idx: int, -# scale: float, -# ): -# """ -# Semantics: -# y[i] += ( -# x[i].unsqueeze(0) -# @ w_t_all[indices[i], layer_idx, :, :].transpose(-1, -2) -# * scale -# ).squeeze(0) - -# Args: -# y: Shape: `[B, H2]`. Output vectors. Will be changed in-place. -# x: Shape: `[B, H1]`. Input vectors. -# w_t_all: Shape: `[None, L, H2, H1]`. All of the transposed weight -# matrices. -# indicies: Shape: `[B]`. Indices of the weight matrices. -# layer_idx: Layer index of the weight matrices. -# scale: Scaling factor. -# """ -# try: -# import vllm._punica_C as punica_kernels -# except ImportError as e: -# _raise_import_error(e) - -# punica_kernels.dispatch_bgmv(y, x, w_t_all, indicies, layer_idx, scale) - - -# def dispatch_bgmv_low_level( -# y: torch.Tensor, -# x: torch.Tensor, -# w_t_all: torch.Tensor, -# indicies: torch.LongTensor, -# layer_idx: int, -# scale: float, -# y_offset: int, -# y_slice_size: int, -# ): -# """ -# Same as `bgmv` but you can operate on slices of y. -# Pass whole y, define y_offset and y_slice_size. - -# Semantics: -# y[i] += ( -# x[i].unsqueeze(0) -# @ w_t_all[indices[i], layer_idx, :, :].transpose(-1, -2) -# * scale -# ).squeeze(0) - -# Args: -# y: Shape: `[B, H2]`. Output vectors. Will be changed in-place. -# x: Shape: `[B, H1]`. Input vectors. -# w_t_all: Shape: `[None, L, y_slice_size, H1]`. Column partition of -# all of the transposed LoRA matrices. -# indicies: Shape: `[B]`. Indices of the LoRA weights. -# layer_idx: Layer index of LoRA weights. -# scale: Scaling factor. -# y_offset: Offset to apply to the starting column of y. -# y_slice_size: Size of the y column slice. -# """ -# try: -# import vllm._punica_C as punica_kernels -# except ImportError as e: -# _raise_import_error(e) -# punica_kernels.dispatch_bgmv_low_level( -# y, -# x, -# w_t_all, -# indicies, -# layer_idx, -# scale, -# x.size(1), -# y_slice_size, -# y_offset, -# ) - - -# def add_lora( -# y: torch.Tensor, -# x: torch.Tensor, -# wa_t_all: torch.Tensor, -# wb_t_all: torch.Tensor, -# indicies: torch.LongTensor, -# layer_idx: int, -# scale: float, -# *, -# buffer: Optional[torch.Tensor] = None, -# ): -# """ -# Semantics: -# y[i] += ( -# x[i].unsqueeze(0) -# @ wa_t_all[indices[i], layer_idx, :, :].transpose(-1, -2) -# @ wb_t_all[indices[i], layer_idx, :, :].transpose(-1, -2) -# * scale -# ).squeeze(0) - -# Args: -# y: Shape: `[B, H2]`. Output vectors. Will be changed in-place. -# x: Shape: `[B, H1]`. Input vectors. -# wa_t_all: Shape: `[None, L, R, H1]`. All of the transposed -# LoRA A matrices. -# wb_t_all: Shape: `[None, L, H2, R]`. All of the transposed -# LoRA B matrices. -# indicies: Shape: `[B]`. Indices of the LoRA weights. -# layer_idx: Layer index of LoRA weights. -# scale: Scaling factor. -# buffer: Optional. Shape: `[B, R]`. Temporary buffer. -# """ -# try: -# import vllm._punica_C as punica_kernels -# except ImportError as e: -# _raise_import_error(e) - -# r = wb_t_all.size(-1) -# if buffer is None: -# # We set the buffer to be float32 by default to avoid -# # numerical inaccuracies that would otherwise happen -# # due to downcasting. 
-# buffer = torch.zeros((x.size(0), r), -# dtype=torch.float32, -# device=x.device) -# punica_kernels.dispatch_bgmv(buffer, x, wa_t_all, indicies, layer_idx, 1.0) -# punica_kernels.dispatch_bgmv(y, buffer, wb_t_all, indicies, layer_idx, -# scale) - - - - - -# def add_lora_slice( -# y: torch.Tensor, -# x: torch.Tensor, -# wa_t_all: torch.Tensor, -# wb_t_all: torch.Tensor, -# indicies: torch.LongTensor, -# layer_idx: int, -# scale: float, -# y_offset: int, -# y_slice_size: int, -# *, -# buffer: Optional[torch.Tensor] = None, -# ): -# """ -# Same as `add_lora` but you can operate on slices of y. -# Pass whole y, define y_offset and y_slice_size. - -# Semantics: -# y[i] += ( -# x[i].unsqueeze(0) -# @ wa_t_all[indices[i], layer_idx, :, :].transpose(-1, -2) -# @ wb_t_all[indices[i], layer_idx, :, :].transpose(-1, -2) -# * scale -# ).squeeze(0) - -# Args: -# y: Shape: `[B, H2]`. Output vectors. Will be changed in-place. -# x: Shape: `[B, H1]`. Input vectors. -# wa_t_all: Shape: `[None, L, R, H1]`. All of the transposed -# LoRA A matrices. -# wb_t_all: Shape: `[None, L, H2, R]`. All of the transposed -# LoRA B matrices. -# indicies: Shape: `[B]`. Indices of the LoRA weights. -# layer_idx: Layer index of LoRA weights. -# scale: Scaling factor. -# y_offset: Offset to apply to the starting column of y. -# y_slice_size: Size of the y column slice. -# """ -# try: -# import vllm._punica_C as punica_kernels -# except ImportError as e: -# _raise_import_error(e) - -# r = wb_t_all.size(-1) -# if buffer is None: -# # We set the buffer to be float32 by default to avoid -# # numerical inaccuracies that would otherwise happen -# # due to downcasting. -# buffer = torch.zeros((x.size(0), r), -# dtype=torch.float32, -# device=x.device) -# punica_kernels.dispatch_bgmv_low_level( -# buffer, -# x, -# wa_t_all, -# indicies, -# layer_idx, -# 1.0, -# x.size(1), -# buffer.size(1), -# 0, -# ) -# punica_kernels.dispatch_bgmv_low_level( -# y, -# buffer, -# wb_t_all, -# indicies, -# layer_idx, -# scale, -# buffer.size(1), -# y_slice_size, -# y_offset, -# ) +def _raise_import_error(e): + if torch.cuda.get_device_capability() < (8, 0): + raise ImportError( + "punica LoRA kernels require compute capability >= 8.0") from e + else: + raise ImportError( + "punica LoRA kernels could not be imported. If you built vLLM " + "from source, make sure VLLM_INSTALL_PUNICA_KERNELS=1 env var " + "was set.") from e + + +def bgmv( + y: torch.Tensor, + x: torch.Tensor, + w_t_all: torch.Tensor, + indicies: torch.LongTensor, + layer_idx: int, + scale: float, +): + """ + Semantics: + y[i] += ( + x[i].unsqueeze(0) + @ w_t_all[indices[i], layer_idx, :, :].transpose(-1, -2) + * scale + ).squeeze(0) + + Args: + y: Shape: `[B, H2]`. Output vectors. Will be changed in-place. + x: Shape: `[B, H1]`. Input vectors. + w_t_all: Shape: `[None, L, H2, H1]`. All of the transposed weight + matrices. + indicies: Shape: `[B]`. Indices of the weight matrices. + layer_idx: Layer index of the weight matrices. + scale: Scaling factor. + """ + try: + import vllm._punica_C as punica_kernels + except ImportError as e: + _raise_import_error(e) + + punica_kernels.dispatch_bgmv(y, x, w_t_all, indicies, layer_idx, scale) + + +def dispatch_bgmv_low_level( + y: torch.Tensor, + x: torch.Tensor, + w_t_all: torch.Tensor, + indicies: torch.LongTensor, + layer_idx: int, + scale: float, + y_offset: int, + y_slice_size: int, +): + """ + Same as `bgmv` but you can operate on slices of y. + Pass whole y, define y_offset and y_slice_size. 
+ + Semantics: + y[i] += ( + x[i].unsqueeze(0) + @ w_t_all[indices[i], layer_idx, :, :].transpose(-1, -2) + * scale + ).squeeze(0) + + Args: + y: Shape: `[B, H2]`. Output vectors. Will be changed in-place. + x: Shape: `[B, H1]`. Input vectors. + w_t_all: Shape: `[None, L, y_slice_size, H1]`. Column partition of + all of the transposed LoRA matrices. + indicies: Shape: `[B]`. Indices of the LoRA weights. + layer_idx: Layer index of LoRA weights. + scale: Scaling factor. + y_offset: Offset to apply to the starting column of y. + y_slice_size: Size of the y column slice. + """ + try: + import vllm._punica_C as punica_kernels + except ImportError as e: + _raise_import_error(e) + punica_kernels.dispatch_bgmv_low_level( + y, + x, + w_t_all, + indicies, + layer_idx, + scale, + x.size(1), + y_slice_size, + y_offset, + ) + + +def add_lora( + y: torch.Tensor, + x: torch.Tensor, + wa_t_all: torch.Tensor, + wb_t_all: torch.Tensor, + indicies: torch.LongTensor, + layer_idx: int, + scale: float, + *, + buffer: Optional[torch.Tensor] = None, +): + """ + Semantics: + y[i] += ( + x[i].unsqueeze(0) + @ wa_t_all[indices[i], layer_idx, :, :].transpose(-1, -2) + @ wb_t_all[indices[i], layer_idx, :, :].transpose(-1, -2) + * scale + ).squeeze(0) + + Args: + y: Shape: `[B, H2]`. Output vectors. Will be changed in-place. + x: Shape: `[B, H1]`. Input vectors. + wa_t_all: Shape: `[None, L, R, H1]`. All of the transposed + LoRA A matrices. + wb_t_all: Shape: `[None, L, H2, R]`. All of the transposed + LoRA B matrices. + indicies: Shape: `[B]`. Indices of the LoRA weights. + layer_idx: Layer index of LoRA weights. + scale: Scaling factor. + buffer: Optional. Shape: `[B, R]`. Temporary buffer. + """ + try: + import vllm._punica_C as punica_kernels + except ImportError as e: + _raise_import_error(e) + + r = wb_t_all.size(-1) + if buffer is None: + # We set the buffer to be float32 by default to avoid + # numerical inaccuracies that would otherwise happen + # due to downcasting. + buffer = torch.zeros((x.size(0), r), + dtype=torch.float32, + device=x.device) + punica_kernels.dispatch_bgmv(buffer, x, wa_t_all, indicies, layer_idx, 1.0) + punica_kernels.dispatch_bgmv(y, buffer, wb_t_all, indicies, layer_idx, + scale) + + + + + +def add_lora_slice( + y: torch.Tensor, + x: torch.Tensor, + wa_t_all: torch.Tensor, + wb_t_all: torch.Tensor, + indicies: torch.LongTensor, + layer_idx: int, + scale: float, + y_offset: int, + y_slice_size: int, + *, + buffer: Optional[torch.Tensor] = None, +): + """ + Same as `add_lora` but you can operate on slices of y. + Pass whole y, define y_offset and y_slice_size. + + Semantics: + y[i] += ( + x[i].unsqueeze(0) + @ wa_t_all[indices[i], layer_idx, :, :].transpose(-1, -2) + @ wb_t_all[indices[i], layer_idx, :, :].transpose(-1, -2) + * scale + ).squeeze(0) + + Args: + y: Shape: `[B, H2]`. Output vectors. Will be changed in-place. + x: Shape: `[B, H1]`. Input vectors. + wa_t_all: Shape: `[None, L, R, H1]`. All of the transposed + LoRA A matrices. + wb_t_all: Shape: `[None, L, H2, R]`. All of the transposed + LoRA B matrices. + indicies: Shape: `[B]`. Indices of the LoRA weights. + layer_idx: Layer index of LoRA weights. + scale: Scaling factor. + y_offset: Offset to apply to the starting column of y. + y_slice_size: Size of the y column slice. 
+ """ + try: + import vllm._punica_C as punica_kernels + except ImportError as e: + _raise_import_error(e) + + r = wb_t_all.size(-1) + if buffer is None: + # We set the buffer to be float32 by default to avoid + # numerical inaccuracies that would otherwise happen + # due to downcasting. + buffer = torch.zeros((x.size(0), r), + dtype=torch.float32, + device=x.device) + punica_kernels.dispatch_bgmv_low_level( + buffer, + x, + wa_t_all, + indicies, + layer_idx, + 1.0, + x.size(1), + buffer.size(1), + 0, + ) + punica_kernels.dispatch_bgmv_low_level( + y, + buffer, + wb_t_all, + indicies, + layer_idx, + scale, + buffer.size(1), + y_slice_size, + y_offset, + ) def add_lora_triton( y: torch.Tensor, From f75ce8686ca933421d8d28932fba347b0bed3ffe Mon Sep 17 00:00:00 2001 From: jeejeeli Date: Thu, 6 Jun 2024 00:28:53 +0800 Subject: [PATCH 17/71] optimize kernel --- vllm/lora/ops/bgmv_expand.py | 61 +++++++++++++++++------------ vllm/lora/ops/bgmv_expand_slice.py | 62 +++++++++++++++++------------- vllm/lora/ops/sgmv_shrink.py | 2 +- 3 files changed, 73 insertions(+), 52 deletions(-) diff --git a/vllm/lora/ops/bgmv_expand.py b/vllm/lora/ops/bgmv_expand.py index 6132b6047997..888fa537a7c4 100644 --- a/vllm/lora/ops/bgmv_expand.py +++ b/vllm/lora/ops/bgmv_expand.py @@ -28,6 +28,7 @@ def _bgmv_expand_kernel( BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr, SPLIT_N: tl.constexpr, + EVEN_K: tl.constexpr, ADD_INPUTS: tl.constexpr, CAST_TYPE: tl.constexpr, ): @@ -41,41 +42,49 @@ def _bgmv_expand_kernel( return offset_k = tl.arange(0, BLOCK_K) offset_n = tl.arange(0, BLOCK_N) - tiled_a = tl.load( - input_ptr + cur_batch * xm_stride + offset_k * xk_stride, - mask=offset_k < K, - other=0, - ) # [BLOCK_K] + if EVEN_K: + tiled_a = tl.load( + input_ptr + cur_batch * xm_stride + offset_k * xk_stride, + ) # [BLOCK_K] + else: + tiled_a = tl.load( + input_ptr + cur_batch * xm_stride + offset_k * xk_stride, + mask=offset_k < K, + other=0, + ) # [BLOCK_K] split_n_length = tl.cdiv(N, SPLIT_N) if CAST_TYPE: tiled_a = tiled_a.to(lora_ptr.dtype.element_ty) # sliding to next row-block - b_ptr = (lora_ptr + l0_stride * lora_index + - pid_sn * split_n_length * lora_k_stride) + b_ptr = ( + lora_ptr + + l0_stride * lora_index + + pid_sn * split_n_length * lora_k_stride + ) + c_ptr = out_ptr + cur_batch * cm_stride + pid_sn * split_n_length for n in range(0, split_n_length, BLOCK_N): current_n = n + offset_n # vector load current_n_c = tl.max_contiguous(current_n, BLOCK_N) - b_ptr_mask = (current_n[:, None] < split_n_length) & (offset_k[None, :] - < K) - + b_ptr_mask = (current_n[:, None] < split_n_length) & ( + offset_k[None, :] < K + ) + c_mask = current_n < split_n_length tiled_b = tl.load( - b_ptr + current_n_c[:, None] * lora_k_stride + - offset_k[None, :] * lora_n_stride, + b_ptr + + current_n_c[:, None] * lora_k_stride + + offset_k[None, :] * lora_n_stride, mask=b_ptr_mask, other=0.0, ) # [BLOCK_N,BLOCK_K] - - accumulator = tl.sum(tiled_a * tiled_b, 1) - - c_ptr = (out_ptr + cur_batch * cm_stride + pid_sn * split_n_length + - current_n * cn_stride) - c_mask = current_n < split_n_length if ADD_INPUTS: - tiled_out = tl.load(c_ptr, mask=c_mask) - accumulator += tiled_out - tl.store(c_ptr, accumulator, mask=c_mask) + tiled_out = tl.load(c_ptr + current_n * cn_stride, mask=c_mask) + accumulator = tl.sum(tiled_a * tiled_b, 1) + tiled_out + else: + accumulator = tl.sum(tiled_a * tiled_b, 1) + + tl.store(c_ptr + current_n * cn_stride, accumulator, mask=c_mask) @torch.inference_mode() @@ -122,14 +131,15 @@ def bgmv_expand( # TODO 
tuning this config N, K = lora_b_weights.shape[-2:] # K= rank,N=hidden_size - BLOCK_N = 512 + BLOCK_N = 128 BLOCK_K = triton.next_power_of_2(K) - SPLIT_N = 8 + SPLIT_N = 128 + EVEN_K = K % BLOCK_K == 0 ADD_INPUTS = add_inputs CAST_TYPE = False if inputs.dtype == torch.float32 and lora_b_weights.dtype in [ - torch.float16, - torch.bfloat16, + torch.float16, + torch.bfloat16, ]: CAST_TYPE = True grid = [ @@ -153,6 +163,7 @@ def bgmv_expand( BLOCK_N, BLOCK_K, SPLIT_N, + EVEN_K, ADD_INPUTS, CAST_TYPE, ) diff --git a/vllm/lora/ops/bgmv_expand_slice.py b/vllm/lora/ops/bgmv_expand_slice.py index 262f7669e0a2..a8fb5719ab95 100644 --- a/vllm/lora/ops/bgmv_expand_slice.py +++ b/vllm/lora/ops/bgmv_expand_slice.py @@ -29,6 +29,7 @@ def _bgmv_expand_slice_kernel( BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr, SPLIT_N: tl.constexpr, + EVEN_K: tl.constexpr, ADD_INPUTS: tl.constexpr, CAST_TYPE: tl.constexpr, ): @@ -42,41 +43,48 @@ def _bgmv_expand_slice_kernel( return offset_k = tl.arange(0, BLOCK_K) offset_n = tl.arange(0, BLOCK_N) - tiled_a = tl.load( - input_ptr + cur_batch * xm_stride + offset_k * xk_stride, - mask=offset_k < K, - other=0, - ) # [BLOCK_K] + if EVEN_K: + tiled_a = tl.load( + input_ptr + cur_batch * xm_stride + offset_k * xk_stride, + ) # [BLOCK_K] + else: + tiled_a = tl.load( + input_ptr + cur_batch * xm_stride + offset_k * xk_stride, + mask=offset_k < K, + other=0, + ) # [BLOCK_K] split_n_length = tl.cdiv(N, SPLIT_N) if CAST_TYPE: tiled_a = tiled_a.to(lora_ptr.dtype.element_ty) # sliding to next row-block - b_ptr = (lora_ptr + l0_stride * lora_index + - pid_sn * split_n_length * lora_k_stride) + b_ptr = ( + lora_ptr + + l0_stride * lora_index + + pid_sn * split_n_length * lora_k_stride + ) + c_ptr = out_ptr + cur_batch * cm_stride + pid_sn * split_n_length for n in range(0, split_n_length, BLOCK_N): current_n = n + offset_n - # vector load - current_n_c = tl.max_contiguous(current_n, BLOCK_N) - b_ptr_mask = (current_n[:, None] < split_n_length) & (offset_k[None, :] - < K) - + b_ptr_mask = (current_n[:, None] < split_n_length) & ( + offset_k[None, :] < K + ) + c_mask = current_n < split_n_length tiled_b = tl.load( - b_ptr + current_n_c[:, None] * lora_k_stride + - offset_k[None, :] * lora_n_stride, + b_ptr + + current_n[:, None] * lora_k_stride + + offset_k[None, :] * lora_n_stride, mask=b_ptr_mask, other=0.0, ) # [BLOCK_N,BLOCK_K] - accumulator = tl.sum(tiled_a * tiled_b, 1) - - c_ptr = (out_ptr + cur_batch * cm_stride + pid_sn * split_n_length + - slice_offset * cn_stride + current_n * cn_stride) - c_mask = current_n < split_n_length if ADD_INPUTS: - tiled_out = tl.load(c_ptr, mask=c_mask) - accumulator += tiled_out - tl.store(c_ptr, accumulator, mask=c_mask) + tiled_out = tl.load(c_ptr + current_n * cn_stride, mask=c_mask) + accumulator = tl.sum(tiled_a * tiled_b, 1) + tiled_out + else: + accumulator = tl.sum(tiled_a * tiled_b, 1) + + tl.store(c_ptr + current_n * cn_stride, accumulator, mask=c_mask) @torch.inference_mode() @@ -126,14 +134,15 @@ def bgmv_expand_slice( # TODO tuning this config N, K = lora_b_weights.shape[-2:] # K= rank,N=hidden_size - BLOCK_N = 512 + BLOCK_N = 256 BLOCK_K = triton.next_power_of_2(K) - SPLIT_N = 8 + SPLIT_N = 128 + EVEN_K = K % BLOCK_K == 0 ADD_INPUTS = add_inputs CAST_TYPE = False if inputs.dtype == torch.float32 and lora_b_weights.dtype in [ - torch.float16, - torch.bfloat16, + torch.float16, + torch.bfloat16, ]: CAST_TYPE = True grid = [ @@ -158,6 +167,7 @@ def bgmv_expand_slice( BLOCK_N, BLOCK_K, SPLIT_N, + EVEN_K, ADD_INPUTS, CAST_TYPE, ) diff 
--git a/vllm/lora/ops/sgmv_shrink.py b/vllm/lora/ops/sgmv_shrink.py index 518cf70bbf12..9fc7508c9421 100644 --- a/vllm/lora/ops/sgmv_shrink.py +++ b/vllm/lora/ops/sgmv_shrink.py @@ -148,7 +148,7 @@ def sgmv_shrink( BLOCK_N = 16 BLOCK_K = 32 SPLIT_K = 8 - EVEN_K = K % BLOCK_K == 0 + EVEN_K = K % (BLOCK_K*SPLIT_K) == 0 grid = [ triton.cdiv(max_seq_length, BLOCK_M) * triton.cdiv(N, BLOCK_N), SPLIT_K, From c0bc06a4e3554207eba7577f9856b1989b9c69cc Mon Sep 17 00:00:00 2001 From: jeejeeli Date: Tue, 11 Jun 2024 21:18:18 +0800 Subject: [PATCH 18/71] trigger test --- tests/lora/test_triton_punica.py | 431 +++++++++++++++++++---------- vllm/lora/layers.py | 95 ++++--- vllm/lora/models.py | 2 +- vllm/lora/ops/bgmv_expand.py | 36 +-- vllm/lora/ops/bgmv_expand_slice.py | 38 ++- vllm/lora/ops/bgmv_shrink.py | 19 +- vllm/lora/ops/sgmv_expand.py | 5 +- vllm/lora/ops/sgmv_expand_slice.py | 3 +- vllm/lora/ops/sgmv_shrink.py | 13 +- vllm/lora/punica.py | 73 ++--- 10 files changed, 442 insertions(+), 273 deletions(-) diff --git a/tests/lora/test_triton_punica.py b/tests/lora/test_triton_punica.py index 6aea2573d962..8f28821a9336 100644 --- a/tests/lora/test_triton_punica.py +++ b/tests/lora/test_triton_punica.py @@ -1,14 +1,14 @@ -import random - import pytest import torch +import vllm._punica_C as punica_kernels import vllm.lora.punica as punica -from vllm.lora.ops.sgmv_expand import sgmv_expand -from vllm.lora.ops.sgmv_shrink import sgmv_shrink from vllm.lora.ops.bgmv_expand import bgmv_expand +from vllm.lora.ops.bgmv_expand_slice import bgmv_expand_slice from vllm.lora.ops.bgmv_shrink import bgmv_shrink +from vllm.lora.ops.sgmv_expand import sgmv_expand from vllm.lora.ops.sgmv_expand_slice import sgmv_expand_slice +from vllm.lora.ops.sgmv_shrink import sgmv_shrink # The current punica kernel supports dimension and adds a dimension of 3424. 
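# Illustrative sketch (not from this patch) of why the EVEN_K change above
# tests K % (BLOCK_K * SPLIT_K) rather than K % BLOCK_K: each split-K
# program starts at pid_sk * BLOCK_K and advances by BLOCK_K * SPLIT_K, so
# across the grid the unmasked loads touch
# ceil(K / (BLOCK_K * SPLIT_K)) * BLOCK_K * SPLIT_K positions along K.
def overread_without_mask(K: int, BLOCK_K: int, SPLIT_K: int) -> int:
    step = BLOCK_K * SPLIT_K
    num_iters = -(-K // step)        # ceil division, same as tl.cdiv
    return num_iters * step - K      # elements read past K if loads are unmasked

assert overread_without_mask(4096, 32, 8) == 0    # safe to skip the mask
assert overread_without_mask(4128, 32, 8) == 224  # multiple of BLOCK_K only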
HIDDEN_SIZES = [ @@ -62,20 +62,26 @@ 128000, 128256, ] -BATCHS = [i for i in range(0, 128, 8)] + +_BATCH_SIZE_ALIGNMENT = 8 + +# vllm support batch size +BATCHS = [1, 2, 4] + [_BATCH_SIZE_ALIGNMENT * i for i in range(1, 8)] + NUM_LORA = [1, 4, 8, 16, 32, 64, 128, 256] -DTYPES = [torch.half, torch.bfloat16, torch.float32] +DTYPES = [torch.float16,torch.bfloat16] MAX_RANKS = [1, 4, 8, 16, 32, 64, 128] SCALES = [0.5] OP_TYPES = ["shrink", "expand"] SEED = [0] CUDA_DEVICES = [f"cuda:{0}"] +NSLICES = [2, 3] def assert_close(a, b): rtol, atol = { - torch.float16: (1e-2, 1e-2), - torch.bfloat16: (12e-2, 1e-2), + torch.float16: (6e-2, 6e-2), + torch.bfloat16: (6e-2, 6e-2), torch.float32: (1e-2, 1e-2), }[a.dtype] torch.testing.assert_close(a, b, rtol=rtol, atol=atol) @@ -101,7 +107,7 @@ def _torch_groupgemm( out_list = [] current_offset = 0 for lora_index, b_length in zip(range(batchs), seq_len_tensor): - input_weight = inputs[current_offset:b_length + current_offset, :] + input_weight = inputs[current_offset : b_length + current_offset, :] current_offset += b_length lora_weight = lora_weights[lora_indices_tensor[lora_index]] result = torch.nn.functional.linear(input_weight, lora_weight) @@ -115,27 +121,29 @@ def _torch_groupgemm( return -def _generate_data(batchs, hidden_size, lora_nums, max_rank, max_length, dtype, - op_type, device): +def _generate_data( + batchs, hidden_size, lora_nums, max_rank, max_length, dtype, op_type, device +): if max_length == 1: max_length += 1 - seq_len_tensor = torch.randint(1, max_length, (batchs, )).to(device) + seq_len_tensor = torch.randint(1, max_length, (batchs,)).to(device) b_seq_start_loc = torch.cumsum( torch.tensor([0] + seq_len_tensor[:-1].tolist(), dtype=torch.long), dim=0, ).to(device) total_tokens = seq_len_tensor.sum() if op_type == "shrink": - inputs_tensor = torch.rand((total_tokens, hidden_size), - dtype=dtype).to(device) + inputs_tensor = torch.rand((total_tokens, hidden_size), dtype=dtype).to( + device + ) lora_weights = torch.rand( (lora_nums, max_rank, hidden_size), # col-major dtype=dtype, ).to(device) # shrink op need atomic_add, so output is initinized by 0 - ref_out_tensor = torch.zeros((total_tokens, max_rank), - dtype=dtype, - device=inputs_tensor.device) + ref_out_tensor = torch.zeros( + (total_tokens, max_rank), dtype=dtype, device=inputs_tensor.device + ) # NOTE shrink kernel using torch.float32 as output type our_out_tensor = torch.zeros( (total_tokens, max_rank), @@ -161,15 +169,16 @@ def _generate_data(batchs, hidden_size, lora_nums, max_rank, max_length, dtype, # Ensure the same input. 
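# Illustrative sketch (not from this patch) of why the shrink output above is
# zero-initialized and kept in torch.float32: with SPLIT_K > 1 each split-K
# program adds its partial dot product into the same output element (the
# kernel uses tl.atomic_add), so the buffer must start at zero, and a float32
# accumulator keeps the partial sums accurate. The expand output is instead
# seeded with existing values because expand adds onto them (add_inputs).
import torch

K, SPLIT_K = 1024, 8
x, w = torch.randn(K), torch.randn(K)
out = torch.zeros((), dtype=torch.float32)       # must be zeroed beforehand
for pid_sk in range(SPLIT_K):                    # one partial sum per program
    out += x[pid_sk::SPLIT_K] @ w[pid_sk::SPLIT_K]
torch.testing.assert_close(out, x @ w, rtol=1e-4, atol=1e-3)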
our_out_tensor = ref_out_tensor.clone() - lora_indices_tensor = torch.randint(0, - lora_nums - 1 if lora_nums > 1 else 1, - (batchs, )).to(device) + lora_indices_tensor = torch.randint( + 0, lora_nums - 1 if lora_nums > 1 else 1, (batchs,) + ).to(device) indices = torch.zeros((total_tokens), dtype=torch.long).to(device) current_offset = 0 for b_id in range(batchs): lora_index = lora_indices_tensor[b_id] - indices[current_offset:current_offset + - seq_len_tensor[b_id]] = lora_index.item() + indices[ + current_offset : current_offset + seq_len_tensor[b_id] + ] = lora_index.item() current_offset += seq_len_tensor[b_id].item() return ( inputs_tensor, @@ -183,89 +192,145 @@ def _generate_data(batchs, hidden_size, lora_nums, max_rank, max_length, dtype, ) -# @pytest.mark.skip("work in progress") -@pytest.mark.parametrize("batchs", BATCHS) -@pytest.mark.parametrize("num_loras", NUM_LORA) -@pytest.mark.parametrize("rank", MAX_RANKS) -@pytest.mark.parametrize("scaling", SCALES) -@pytest.mark.parametrize("dtype", DTYPES) -@pytest.mark.parametrize("op_type", OP_TYPES) -@pytest.mark.parametrize("seed", SEED) -@pytest.mark.parametrize("device", CUDA_DEVICES) -def test_sgmv_torch( - batchs: int, - num_loras: int, - rank: int, - scaling: float, - dtype: torch.dtype, - op_type: str, - seed: int, - device: str, +def _generate_data_expand_nslices( + batchs, hidden_size, lora_nums, max_rank, max_length, dtype, nslices, device ): - torch.manual_seed(seed) - if batchs == 0: - batchs += 1 - hidden_size_index = random.randint(0, len(HIDDEN_SIZES) - 1) - hidden_size = HIDDEN_SIZES[hidden_size_index] - if hidden_size > 100000: - hidden_size = hidden_size // 4 # avoid OOM - ( + if max_length == 1: + max_length += 1 + seq_len_tensor = torch.randint(1, max_length, (batchs,)).to(device) + b_seq_start_loc = torch.cumsum( + torch.tensor([0] + seq_len_tensor[:-1].tolist(), dtype=torch.long), + dim=0, + ).to(device) + total_tokens = seq_len_tensor.sum() + + inputs_tensor = torch.rand( + (total_tokens, max_rank), + dtype=dtype, + ).to(device) + lora_weights_lst = [] + for _ in range(nslices): + lora_weights_lst.append( + torch.rand( + (lora_nums, hidden_size, max_rank), # col-major + dtype=dtype, + ).to(device) + ) + # expand op needs to complete y+=a@lora_b, so output is + # initinized randomly + ref_out_tensor = torch.rand( + (total_tokens, hidden_size * nslices), + dtype=dtype, + device=inputs_tensor.device, + ) + # Ensure the same input. + our_out_tensor = ref_out_tensor.clone() + + lora_indices_tensor = torch.randint( + 0, lora_nums - 1 if lora_nums > 1 else 1, (batchs,) + ).to(device) + indices = torch.zeros((total_tokens), dtype=torch.long).to(device) + current_offset = 0 + for b_id in range(batchs): + lora_index = lora_indices_tensor[b_id] + indices[ + current_offset : current_offset + seq_len_tensor[b_id] + ] = lora_index.item() + current_offset += seq_len_tensor[b_id].item() + return ( inputs_tensor, - lora_weights, + lora_weights_lst, our_out_tensor, ref_out_tensor, b_seq_start_loc, lora_indices_tensor, seq_len_tensor, indices, - ) = _generate_data( - batchs, hidden_size, num_loras, rank, 1024, dtype, op_type, - device) # The sequence length is restricted to the range [1, 1024]. 
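# Illustrative sketch (not from this patch): the per-sequence metadata built
# by _generate_data above, shown with an assumed toy batch. b_seq_start_loc
# is an exclusive cumsum of the sequence lengths, and the token-level
# `indices` repeat each sequence's LoRA id once per token.
import torch

seq_lens = torch.tensor([4, 6, 2])     # assumed lengths of 3 sequences
lora_ids = torch.tensor([1, 0, 3])     # one LoRA id per sequence
b_seq_start_loc = torch.cat(
    [torch.zeros(1, dtype=torch.long), seq_lens.cumsum(0)[:-1]])
token_lora_ids = torch.repeat_interleave(lora_ids, seq_lens)
print(b_seq_start_loc.tolist())  # [0, 4, 10]
print(token_lora_ids.tolist())   # [1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 3, 3]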
- max_seq_length = seq_len_tensor.max() - if isinstance(max_seq_length, tuple): - max_seq_length = max_seq_length[0].item() - else: - max_seq_length = max_seq_length.item() - if op_type == "shrink": - sgmv_shrink( - inputs_tensor, - lora_weights, - our_out_tensor, - b_seq_start_loc, - seq_len_tensor, - lora_indices_tensor, - batchs, - max_seq_length, - scaling, - ) - else: - sgmv_expand( - inputs_tensor, - lora_weights, - our_out_tensor, - b_seq_start_loc, - seq_len_tensor, - lora_indices_tensor, - batchs, - max_seq_length, - add_inputs=True, - ) - _torch_groupgemm( - ref_out_tensor, - inputs_tensor, - lora_weights, - lora_indices_tensor, - seq_len_tensor, - batchs, - scaling if op_type == "shrink" else 1.0, - op_type, ) - if op_type == "shrink": - ref_out_tensor = ref_out_tensor.to(torch.float32) - assert_close(our_out_tensor, ref_out_tensor) -# @pytest.mark.skip("work in progress") +# @pytest.mark.parametrize("batchs", BATCHS) +# @pytest.mark.parametrize("num_loras", NUM_LORA) +# @pytest.mark.parametrize("rank", MAX_RANKS) +# @pytest.mark.parametrize("scaling", SCALES) +# @pytest.mark.parametrize("dtype", DTYPES) +# @pytest.mark.parametrize("op_type", OP_TYPES) +# @pytest.mark.parametrize("seed", SEED) +# @pytest.mark.parametrize("device", CUDA_DEVICES) +# def test_sgmv_torch( +# batchs: int, +# num_loras: int, +# rank: int, +# scaling: float, +# dtype: torch.dtype, +# op_type: str, +# seed: int, +# device: str, +# ): +# torch.manual_seed(seed) +# torch.set_default_device(device) +# if batchs == 0: +# batchs += 1 +# hidden_size_index = random.randint(0, len(HIDDEN_SIZES) - 1) +# hidden_size = HIDDEN_SIZES[hidden_size_index] +# if hidden_size > 100000: +# hidden_size = hidden_size // 4 # avoid OOM +# ( +# inputs_tensor, +# lora_weights, +# our_out_tensor, +# ref_out_tensor, +# b_seq_start_loc, +# lora_indices_tensor, +# seq_len_tensor, +# indices, +# ) = _generate_data( +# batchs, hidden_size, num_loras, rank, 1024, dtype, op_type, device +# ) # The sequence length is restricted to the range [1, 1024]. 
+# max_seq_length = seq_len_tensor.max() +# if isinstance(max_seq_length, tuple): +# max_seq_length = max_seq_length[0].item() +# else: +# max_seq_length = max_seq_length.item() +# if op_type == "shrink": +# sgmv_shrink( +# inputs_tensor, +# lora_weights, +# our_out_tensor, +# b_seq_start_loc, +# seq_len_tensor, +# lora_indices_tensor, +# batchs, +# max_seq_length, +# scaling, +# ) +# else: +# sgmv_expand( +# inputs_tensor, +# lora_weights, +# our_out_tensor, +# b_seq_start_loc, +# seq_len_tensor, +# lora_indices_tensor, +# batchs, +# max_seq_length, +# add_inputs=True, +# ) +# _torch_groupgemm( +# ref_out_tensor, +# inputs_tensor, +# lora_weights, +# lora_indices_tensor, +# seq_len_tensor, +# batchs, +# scaling if op_type == "shrink" else 1.0, +# op_type, +# ) +# if op_type == "shrink": +# ref_out_tensor = ref_out_tensor.to(torch.float32) +# assert_close(our_out_tensor, ref_out_tensor) + + @pytest.mark.parametrize("hidden_size", HIDDEN_SIZES) @pytest.mark.parametrize("scaling", SCALES) @pytest.mark.parametrize("dtype", DTYPES) @@ -284,9 +349,10 @@ def test_triton_sgmv_punica_bgmv( if dtype == torch.float32 or hidden_size == 3424: return torch.manual_seed(seed) + torch.set_default_device(device) batchs = 4 # Arbitrary values for testing - rank = 16 - seq_len = 333 # Arbitrary values for testing + rank = 16 # Arbitrary values for testing + seq_len = 128 # Arbitrary values for testing num_loras = 8 # Arbitrary values for testing ( inputs_tensor, @@ -297,8 +363,9 @@ def test_triton_sgmv_punica_bgmv( lora_indices_tensor, seq_len_tensor, indices, - ) = _generate_data(batchs, hidden_size, num_loras, rank, seq_len, dtype, - op_type, device) + ) = _generate_data( + batchs, hidden_size, num_loras, rank, seq_len, dtype, op_type, device + ) max_seq_length = seq_len_tensor.max() if isinstance(max_seq_length, tuple): @@ -362,6 +429,7 @@ def test_triton_bgmv_punica_bgmv( if dtype == torch.float32 or hidden_size == 3424: return torch.manual_seed(seed) + torch.set_default_device(device) if batchs == 0: batchs += 1 rank = 16 @@ -376,8 +444,9 @@ def test_triton_bgmv_punica_bgmv( lora_indices_tensor, seq_len_tensor, indices, - ) = _generate_data(batchs, hidden_size, num_loras, rank, seq_len, dtype, - op_type, device) + ) = _generate_data( + batchs, hidden_size, num_loras, rank, seq_len, dtype, op_type, device + ) if op_type == "shrink": bgmv_shrink( @@ -409,18 +478,17 @@ def test_triton_bgmv_punica_bgmv( ref_out_tensor = ref_out_tensor.to(torch.float32) assert_close(our_out_tensor, ref_out_tensor) - -@pytest.mark.skip("work in progress") +@pytest.mark.parametrize("batchs", BATCHS) @pytest.mark.parametrize("hidden_size", HIDDEN_SIZES) -@pytest.mark.parametrize("scaling", SCALES) +@pytest.mark.parametrize("nslices", NSLICES) @pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("seed", SEED) @pytest.mark.parametrize("device", CUDA_DEVICES) -def test_sgmv_expand_nslice( - hidden_size, - scaling: float, - dtype: torch.dtype, - op_type: str, +def test_sgmv_expand_slice( + batchs:int, + hidden_size: int, + nslices: int, + dtype: str, seed: int, device: str, ): @@ -428,59 +496,140 @@ def test_sgmv_expand_nslice( if dtype == torch.float32 or hidden_size == 3424: return torch.manual_seed(seed) - batchs = 4 # Arbitrary values for testing - rank = 16 - seq_len = 333 # Arbitrary values for testing - num_loras = 8 # Arbitrary values for testing + torch.set_default_device(device) + max_rank = 16 + lora_nums = 4 + max_length = 128 ( inputs_tensor, - lora_weights, - our_out_tensor, - ref_out_tensor, + 
lora_weights_lst, + our_outputs, + ref_outputs, b_seq_start_loc, lora_indices_tensor, seq_len_tensor, indices, - ) = _generate_data(batchs, hidden_size, num_loras, rank, seq_len, dtype, - op_type, device) - + ) = _generate_data_expand_nslices( + batchs, + hidden_size, + lora_nums, + max_rank, + max_length, + dtype, + nslices, + device, + ) max_seq_length = seq_len_tensor.max() if isinstance(max_seq_length, tuple): max_seq_length = max_seq_length[0].item() else: max_seq_length = max_seq_length.item() + slice_offset = 0 + for index in range(nslices): + lora_weights = lora_weights_lst[index] + sgmv_expand_slice( + inputs_tensor, + lora_weights, + our_outputs, + b_seq_start_loc, + seq_len_tensor, + lora_indices_tensor, + batchs, + max_seq_length, + slice_offset, + hidden_size, + add_inputs=True, + ) + lora_weights_4d = lora_weights.unsqueeze(dim=1) + punica_kernels.dispatch_bgmv_low_level( + ref_outputs, + inputs_tensor, + lora_weights_4d, + indices, + 0, + 1.0, + inputs_tensor.size(1), + hidden_size, + slice_offset, + ) + slice_offset += hidden_size + assert_close(our_outputs, ref_outputs) - sgmv_expand_slice( +@pytest.mark.parametrize("batchs", BATCHS) +@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES) +@pytest.mark.parametrize("nslices", NSLICES) +@pytest.mark.parametrize("dtype", DTYPES) +@pytest.mark.parametrize("seed", SEED) +@pytest.mark.parametrize("device", CUDA_DEVICES) +def test_bgmv_expand_slice( + batchs:int, + hidden_size: int, + nslices: int, + dtype: str, + seed: int, + device: str, +): + # avoid `No suitable kernel. h_in=xx h_out=xxxx ` error + if dtype == torch.float32 or hidden_size == 3424: + return + torch.manual_seed(seed) + torch.set_default_device(device) + max_rank = 64 + lora_nums = 8 + ( inputs_tensor, - lora_weights, - our_out_tensor, + lora_weights_lst, + our_outputs, + ref_outputs, b_seq_start_loc, - seq_len_tensor, lora_indices_tensor, - batchs, - max_seq_length, - 1024, - add_inputs=True, - ) - lora_weights_4d = lora_weights.unsqueeze(dim=1) - _punica_bgmv( - ref_out_tensor, - inputs_tensor, - lora_weights_4d, + seq_len_tensor, indices, - scaling if op_type == "shrink" else 1.0, + ) = _generate_data_expand_nslices( + batchs, + hidden_size, + lora_nums, + max_rank, + 1, + dtype, + nslices, + device, ) - - assert_close(our_out_tensor, ref_out_tensor) + slice_offset = 0 + for index in range(nslices): + lora_weights = lora_weights_lst[index] + bgmv_expand_slice( + inputs_tensor, + lora_weights, + our_outputs, + lora_indices_tensor, + slice_offset, + hidden_size, + batchs, + add_inputs=True, + ) + lora_weights_4d = lora_weights.unsqueeze(dim=1) + punica_kernels.dispatch_bgmv_low_level( + ref_outputs, + inputs_tensor, + lora_weights_4d, + lora_indices_tensor, + 0, + 1.0, + inputs_tensor.size(1), + hidden_size, + slice_offset, + ) + slice_offset += hidden_size + assert_close(our_outputs, ref_outputs) if __name__ == "__main__": - test_triton_bgmv_punica_bgmv( - batchs=1, - hidden_size=128, - scaling=0.5, - dtype=torch.float16, - op_type="expand", + test_bgmv_expand_slice( + batchs=256, + hidden_size=3424, + nslices=2, + dtype=torch.bfloat16, seed=0, device="cuda:0", ) diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py index ff922a14d879..96b37ab8880c 100644 --- a/vllm/lora/layers.py +++ b/vllm/lora/layers.py @@ -16,9 +16,8 @@ tensor_model_parallel_all_reduce, tensor_model_parallel_gather) from vllm.distributed.utils import divide -from vllm.lora.punica import (add_lora_triton, - add_lora_triton_slice) from vllm.lora.ops.sgmv_expand import sgmv_expand +from 
vllm.lora.punica import add_lora_triton, add_lora_triton_slice from vllm.model_executor.layers.linear import (ColumnParallelLinear, MergedColumnParallelLinear, QKVParallelLinear, @@ -63,6 +62,7 @@ def dec(*args, **kwargs): return dec + def _apply_lora_triton( x: torch.Tensor, lora_a_stacked: torch.Tensor, @@ -72,30 +72,40 @@ def _apply_lora_triton( lora_index_tensor: torch.Tensor, batch_mlen_stage_lst: List[int], output: torch.Tensor, -): - # """Applies lora to each input. - - # This method applies all loras to each input. It uses the - # indices vector to determine which lora yields the - # correct output. An index of -1 means no lora should be - # applied. This method adds the final lora results to the - # output. - - # Input shapes: - # x: (batch_size, hidden_dim) - # lora_a_stacked: (num_loras, lora_rank, hidden_dim) - # lora_b_stacked: (num_loras, output_dim, lora_rank) - # indices: (batch_size) - # output: (batch_size, output_dim) - # """ +) -> torch.Tensor: + """Applies lora to each input. This method applies all loras to each + input. It uses the `lora_index_tensor` vector to determine which lora + yields the correct output. An index of -1 means no lora should be + applied. This method adds the final lora results to the output. + + Args: + x (torch.Tensor): (batch_size, hidden_dim) + lora_a_stacked (torch.Tensor): (num_loras, lora_rank, hidden_dim) + lora_b_stacked (torch.Tensor): (num_loras, output_dim, lora_rank) + b_seq_start_tensor (torch.Tensor): (batch_size,). The cumulative + sequence lengths of the sequences in the batch, used to index + into sequence. E.g.,if the sequence length is [4, 6], it is + [0, 4]. + seq_length_tensor (torch.Tensor): batch_size,). record the sequence + length of the sequences in the batch + lora_index_tensor (torch.Tensor): (batch_size,). The LoRA index + corresponding to each batch + batch_mlen_stage_lst (List[int]): (3,).Sequentially represent batch + size, maximum seq length, and prefilling stage flag. + output (torch.Tensor): (batch_size, output_dim) + + Returns: + output (torch.Tensor): (batch_size, output_dim) + + """ org_output = output x = x.view(-1, x.shape[-1]) output = output.view(-1, output.shape[-1]) - + # batch_size = batch_mlen_stage_lst[0] max_length = batch_mlen_stage_lst[1] is_prefilling = bool(batch_mlen_stage_lst[2]) - # maybe we need not restrict range to [:batch_size] + add_lora_triton(output, x, lora_a_stacked, lora_b_stacked, b_seq_start_tensor[:batch_size], seq_length_tensor[:batch_size], @@ -114,22 +124,31 @@ def _apply_lora_triton_nslice( batch_mlen_stage_lst: List[int], output: torch.Tensor, output_slices: Tuple[int, ...], -): - # """Applies lora to each input. - - # This method applies all loras to each input. It uses the - # indices vector to determine which lora yields the - # correct output. An index of -1 means no lora should be - # applied. This method adds the final lora results to the - # output. - - # Input shapes: - # x: (batch_size, hidden_dim) - # lora_a_stacked: (num_loras, lora_rank, hidden_dim) - # lora_b_stacked: (num_loras, output_dim, lora_rank) - # indices: (batch_size) - # output: (batch_size, output_dim) - # """ +) -> torch.Tensor: + """Applies lora to each input. This method applies all loras to each + input. It uses the `lora_index_tensor` vector to determine which lora + yields the correct output. An index of -1 means no lora should be + applied. This method adds the final lora results to the output. 
+ + Args: + x (torch.Tensor): (batch_size, hidden_dim) + lora_a_stacked (torch.Tensor): (num_loras, lora_rank, hidden_dim) + lora_b_stacked (torch.Tensor): (num_loras, output_dim, lora_rank) + b_seq_start_tensor (torch.Tensor): (batch_size,). The cumulative + sequence lengths of the sequences in the batch, used to index + into sequence. E.g.,if the sequence length is [4, 6], it is + [0, 4]. + seq_length_tensor (torch.Tensor): batch_size,). record the sequence + length of the sequences in the batch + lora_index_tensor (torch.Tensor): (batch_size,). The LoRA index + corresponding to each batch + batch_mlen_stage_lst (List[int]): (3,).Sequentially represent batch + size, maximum seq length, and prefilling stage flag. + output_slices (Tuple[int, ...]): Size of each output column + + Returns: + output (torch.Tensor): (batch_size, output_dim) + """ org_output = output x = x.view(-1, x.shape[-1]) output = output.view(-1, output.shape[-1]) @@ -1238,9 +1257,9 @@ def _get_logits( self.base_layer.org_vocab_size:self.base_layer.org_vocab_size + lora_logits.shape[1]] = lora_logits - batch_mlen_stage_lst=self.batch_mlen_stage_lst.copy() + batch_mlen_stage_lst = self.batch_mlen_stage_lst.copy() # LogitsProcessorWithLoRA always using bgmv - batch_mlen_stage_lst[2]=False + batch_mlen_stage_lst[2] = False _apply_lora_triton(hidden_states, self.lora_a_stacked, self.lora_b_stacked, self.b_seq_start_tensor, self.seq_length_tensor, diff --git a/vllm/lora/models.py b/vllm/lora/models.py index 438eeff1ff0c..b6c47e599e81 100644 --- a/vllm/lora/models.py +++ b/vllm/lora/models.py @@ -451,7 +451,7 @@ def __init__( # element contains batch_size, max_length, 0 or 1. Use 1 for the # prefilling stage and 0 for the decoding stage.The reason for # distinguishing between the prefilling and decoding stage is that - # if we have implemented bgmv, it can be utilized during the decoding + # we had implemented bgmv, it can be utilized during the decoding # stage. 
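# Illustrative sketch (not from this patch): how the three slots of
# batch_mlen_stage_lst described above are consumed. The code that fills the
# list each step is not shown here, so this helper is an assumption.
def make_batch_mlen_stage(seq_lens, is_prefilling: bool):
    # [batch size, max seq length in the batch, 1 for prefill / 0 for decode]
    return [len(seq_lens), max(seq_lens), int(is_prefilling)]

meta = make_batch_mlen_stage([4, 6, 2], is_prefilling=True)   # -> [3, 6, 1]
batch_size, max_length, stage = meta
use_sgmv = bool(stage)   # prefill uses the SGMV kernels, decode uses BGMV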
self.batch_mlen_stage_lst = [-1] * 3 self._create_lora_modules() diff --git a/vllm/lora/ops/bgmv_expand.py b/vllm/lora/ops/bgmv_expand.py index 888fa537a7c4..8ec26bdb6b83 100644 --- a/vllm/lora/ops/bgmv_expand.py +++ b/vllm/lora/ops/bgmv_expand.py @@ -5,9 +5,9 @@ https://arxiv.org/abs/2310.18547 """ +import torch import triton import triton.language as tl -import torch @triton.jit @@ -33,7 +33,8 @@ def _bgmv_expand_kernel( CAST_TYPE: tl.constexpr, ): """ - C=A@B, and B is col-major matrix + GroupGEMV,Additionally, introducing SPLIT_N can improve large hidden_size's + performance """ pid_sn = tl.program_id(axis=0) cur_batch = tl.program_id(axis=1) @@ -43,9 +44,8 @@ def _bgmv_expand_kernel( offset_k = tl.arange(0, BLOCK_K) offset_n = tl.arange(0, BLOCK_N) if EVEN_K: - tiled_a = tl.load( - input_ptr + cur_batch * xm_stride + offset_k * xk_stride, - ) # [BLOCK_K] + tiled_a = tl.load(input_ptr + cur_batch * xm_stride + + offset_k * xk_stride, ) # [BLOCK_K] else: tiled_a = tl.load( input_ptr + cur_batch * xm_stride + offset_k * xk_stride, @@ -57,24 +57,19 @@ def _bgmv_expand_kernel( if CAST_TYPE: tiled_a = tiled_a.to(lora_ptr.dtype.element_ty) # sliding to next row-block - b_ptr = ( - lora_ptr - + l0_stride * lora_index - + pid_sn * split_n_length * lora_k_stride - ) + b_ptr = (lora_ptr + l0_stride * lora_index + + pid_sn * split_n_length * lora_k_stride) c_ptr = out_ptr + cur_batch * cm_stride + pid_sn * split_n_length for n in range(0, split_n_length, BLOCK_N): current_n = n + offset_n # vector load current_n_c = tl.max_contiguous(current_n, BLOCK_N) - b_ptr_mask = (current_n[:, None] < split_n_length) & ( - offset_k[None, :] < K - ) + b_ptr_mask = (current_n[:, None] < split_n_length) & (offset_k[None, :] + < K) c_mask = current_n < split_n_length tiled_b = tl.load( - b_ptr - + current_n_c[:, None] * lora_k_stride - + offset_k[None, :] * lora_n_stride, + b_ptr + current_n_c[:, None] * lora_k_stride + + offset_k[None, :] * lora_n_stride, mask=b_ptr_mask, other=0.0, ) # [BLOCK_N,BLOCK_K] @@ -112,7 +107,6 @@ def bgmv_expand( assert lora_b_weights.dtype in [ torch.float16, torch.bfloat16, - torch.float32, ] assert inputs.size(1) == lora_b_weights.size(-1) @@ -131,15 +125,15 @@ def bgmv_expand( # TODO tuning this config N, K = lora_b_weights.shape[-2:] # K= rank,N=hidden_size - BLOCK_N = 128 + BLOCK_N = 256 BLOCK_K = triton.next_power_of_2(K) - SPLIT_N = 128 + SPLIT_N = 64 EVEN_K = K % BLOCK_K == 0 ADD_INPUTS = add_inputs CAST_TYPE = False if inputs.dtype == torch.float32 and lora_b_weights.dtype in [ - torch.float16, - torch.bfloat16, + torch.float16, + torch.bfloat16, ]: CAST_TYPE = True grid = [ diff --git a/vllm/lora/ops/bgmv_expand_slice.py b/vllm/lora/ops/bgmv_expand_slice.py index a8fb5719ab95..3d41d064ea7c 100644 --- a/vllm/lora/ops/bgmv_expand_slice.py +++ b/vllm/lora/ops/bgmv_expand_slice.py @@ -5,9 +5,9 @@ https://arxiv.org/abs/2310.18547 """ +import torch import triton import triton.language as tl -import torch @triton.jit @@ -34,7 +34,8 @@ def _bgmv_expand_slice_kernel( CAST_TYPE: tl.constexpr, ): """ - C=A@B, and B is col-major matrix + GroupGEMV,Additionally, introducing SPLIT_N can improve large hidden_size's + performance """ pid_sn = tl.program_id(axis=0) cur_batch = tl.program_id(axis=1) @@ -44,9 +45,8 @@ def _bgmv_expand_slice_kernel( offset_k = tl.arange(0, BLOCK_K) offset_n = tl.arange(0, BLOCK_N) if EVEN_K: - tiled_a = tl.load( - input_ptr + cur_batch * xm_stride + offset_k * xk_stride, - ) # [BLOCK_K] + tiled_a = tl.load(input_ptr + cur_batch * xm_stride + + offset_k * 
xk_stride, ) # [BLOCK_K] else: tiled_a = tl.load( input_ptr + cur_batch * xm_stride + offset_k * xk_stride, @@ -58,22 +58,19 @@ def _bgmv_expand_slice_kernel( if CAST_TYPE: tiled_a = tiled_a.to(lora_ptr.dtype.element_ty) # sliding to next row-block - b_ptr = ( - lora_ptr - + l0_stride * lora_index - + pid_sn * split_n_length * lora_k_stride - ) - c_ptr = out_ptr + cur_batch * cm_stride + pid_sn * split_n_length + b_ptr = (lora_ptr + l0_stride * lora_index + + pid_sn * split_n_length * lora_k_stride) + c_ptr = (out_ptr + cur_batch * cm_stride + pid_sn * split_n_length + + slice_offset * cn_stride) + for n in range(0, split_n_length, BLOCK_N): current_n = n + offset_n - b_ptr_mask = (current_n[:, None] < split_n_length) & ( - offset_k[None, :] < K - ) + b_ptr_mask = (current_n[:, None] < split_n_length) & (offset_k[None, :] + < K) c_mask = current_n < split_n_length tiled_b = tl.load( - b_ptr - + current_n[:, None] * lora_k_stride - + offset_k[None, :] * lora_n_stride, + b_ptr + current_n[:, None] * lora_k_stride + + offset_k[None, :] * lora_n_stride, mask=b_ptr_mask, other=0.0, ) # [BLOCK_N,BLOCK_K] @@ -115,7 +112,6 @@ def bgmv_expand_slice( assert lora_b_weights.dtype in [ torch.float16, torch.bfloat16, - torch.float32, ] assert inputs.size(1) == lora_b_weights.size(-1) assert lora_indices_tensor.size(0) == batchs @@ -136,13 +132,13 @@ def bgmv_expand_slice( N, K = lora_b_weights.shape[-2:] # K= rank,N=hidden_size BLOCK_N = 256 BLOCK_K = triton.next_power_of_2(K) - SPLIT_N = 128 + SPLIT_N = 64 EVEN_K = K % BLOCK_K == 0 ADD_INPUTS = add_inputs CAST_TYPE = False if inputs.dtype == torch.float32 and lora_b_weights.dtype in [ - torch.float16, - torch.bfloat16, + torch.float16, + torch.bfloat16, ]: CAST_TYPE = True grid = [ diff --git a/vllm/lora/ops/bgmv_shrink.py b/vllm/lora/ops/bgmv_shrink.py index 5495e6f54353..d2166a5593ab 100644 --- a/vllm/lora/ops/bgmv_shrink.py +++ b/vllm/lora/ops/bgmv_shrink.py @@ -5,9 +5,9 @@ https://arxiv.org/abs/2310.18547 """ +import torch import triton import triton.language as tl -import torch @triton.jit @@ -30,6 +30,10 @@ def _bgmv_shrink_kernel( BLOCK_K: tl.constexpr, SPLIT_K: tl.constexpr, ): + """ + GroupGEMV,Additionally, introducing SPLIT-K can improve large hidden_size's + performance + """ pid_sk = tl.program_id(axis=0) cur_batch = tl.program_id(axis=1) lora_index = tl.load(lora_indices + cur_batch) @@ -81,7 +85,6 @@ def bgmv_shrink( scaling: float, ): """ - Args: inputs (torch.Tensor): input tensor lora_a_weights (torch.Tensor): lora'a weight @@ -92,7 +95,11 @@ def bgmv_shrink( scaling (float): Scaling factor. 
""" assert inputs.dtype == lora_a_weights.dtype - assert inputs.dtype in [torch.float16, torch.bfloat16, torch.float32] + assert inputs.dtype in [torch.float16, torch.bfloat16] + assert lora_a_weights.dtype in [ + torch.float16, + torch.bfloat16, + ] assert inputs.size(1) == lora_a_weights.size(-1) assert lora_indices_tensor.size(0) == batchs assert inputs.is_contiguous() @@ -106,14 +113,13 @@ def bgmv_shrink( assert output_tensor.is_contiguous() # TODO tuning this config N, K = lora_a_weights.shape[-2:] # K=hidden_size,N=rank - BLOCK_K = 512 + BLOCK_K = 256 BLOCK_N = triton.next_power_of_2(output_tensor.size(1)) - SPLIT_K = 16 + SPLIT_K = 64 grid = [ SPLIT_K, batchs, ] - config = {"num_stages": 4, "num_warps": 8} _bgmv_shrink_kernel[grid]( inputs, lora_a_weights, @@ -132,6 +138,5 @@ def bgmv_shrink( BLOCK_N, BLOCK_K, SPLIT_K, - **config, ) return diff --git a/vllm/lora/ops/sgmv_expand.py b/vllm/lora/ops/sgmv_expand.py index f2af7be4ad62..f34eec0357bd 100644 --- a/vllm/lora/ops/sgmv_expand.py +++ b/vllm/lora/ops/sgmv_expand.py @@ -106,9 +106,7 @@ def sgmv_expand( max_seq_length: int, add_inputs: bool = False, ): - """_summary_ - - + """ Args: inputs (torch.Tensor): input tensor lora_b_weights (torch.Tensor): lora'a weight @@ -132,7 +130,6 @@ def sgmv_expand( assert lora_b_weights.dtype in [ torch.float16, torch.bfloat16, - torch.float32, ] assert inputs.size(1) == lora_b_weights.size(-1) assert b_seq_start_loc.size(0) == batchs diff --git a/vllm/lora/ops/sgmv_expand_slice.py b/vllm/lora/ops/sgmv_expand_slice.py index 72ed81bcbbd3..25975c7ed5fb 100644 --- a/vllm/lora/ops/sgmv_expand_slice.py +++ b/vllm/lora/ops/sgmv_expand_slice.py @@ -60,7 +60,7 @@ def _sgmv_expand_slice_kernel( offset_k = tl.arange(0, BLOCK_K) ram = tl.max_contiguous(tl.multiple_of(offset_m % M, BLOCK_M), BLOCK_M) rbn = tl.max_contiguous(tl.multiple_of(offset_n % N, BLOCK_N), BLOCK_N) - + a_ptr = (input_ptr + cur_seq_start * xm_stride + ram[:, None] * xm_stride + offset_k[None, :] * xk_stride, ) b_ptr = (lora_ptr + l0_stride * lora_index + @@ -140,7 +140,6 @@ def sgmv_expand_slice( assert lora_b_weights.dtype in [ torch.float16, torch.bfloat16, - torch.float32, ] assert inputs.size(1) == lora_b_weights.size(-1) assert b_seq_start_loc.size(0) == batchs diff --git a/vllm/lora/ops/sgmv_shrink.py b/vllm/lora/ops/sgmv_shrink.py index 9fc7508c9421..45aeb9e9fb78 100644 --- a/vllm/lora/ops/sgmv_shrink.py +++ b/vllm/lora/ops/sgmv_shrink.py @@ -118,7 +118,7 @@ def sgmv_shrink( b_seq_start_loc (torch.Tensor): (batch_size,). The cumulative sequence lengths of the sequences in the batch, used to index into sequence. E.g.,if the sequence length is [4, 6], it is - [0, 4, 10]. + [0, 4]. seq_len_tensor (torch.Tensor): (batch_size,). record the sequence length of the sequences in the batch lora_indices_tensor (torch.Tensor): (batch_size,). The LoRA index @@ -129,7 +129,11 @@ def sgmv_shrink( scaling (float): Scaling factor. 
""" assert inputs.dtype == lora_a_weights.dtype - assert inputs.dtype in [torch.float16, torch.bfloat16, torch.float32] + assert inputs.dtype in [torch.float16, torch.bfloat16] + assert lora_a_weights.dtype in [ + torch.float16, + torch.bfloat16, + ] assert inputs.size(1) == lora_a_weights.size(-1) assert b_seq_start_loc.size(0) == batchs assert lora_indices_tensor.size(0) == batchs @@ -148,16 +152,13 @@ def sgmv_shrink( BLOCK_N = 16 BLOCK_K = 32 SPLIT_K = 8 - EVEN_K = K % (BLOCK_K*SPLIT_K) == 0 + EVEN_K = K % (BLOCK_K * SPLIT_K) == 0 grid = [ triton.cdiv(max_seq_length, BLOCK_M) * triton.cdiv(N, BLOCK_N), SPLIT_K, batchs, ] - # grid = lambda META: (triton.cdiv(max_seq_length, META[ - # 'BLOCK_M']) * triton.cdiv(N, META['BLOCK_N']),META[ - # 'SPLIT_K'],batchs) _sgmv_shrink_kernel[grid]( inputs, lora_a_weights, diff --git a/vllm/lora/punica.py b/vllm/lora/punica.py index ad48abf9bb9c..7a07e73a116c 100644 --- a/vllm/lora/punica.py +++ b/vllm/lora/punica.py @@ -4,12 +4,12 @@ import torch -from vllm.lora.ops.sgmv_expand import sgmv_expand -from vllm.lora.ops.sgmv_shrink import sgmv_shrink from vllm.lora.ops.bgmv_expand import bgmv_expand +from vllm.lora.ops.bgmv_expand_slice import bgmv_expand_slice from vllm.lora.ops.bgmv_shrink import bgmv_shrink +from vllm.lora.ops.sgmv_expand import sgmv_expand from vllm.lora.ops.sgmv_expand_slice import sgmv_expand_slice -from vllm.lora.ops.bgmv_expand_slice import bgmv_expand_slice +from vllm.lora.ops.sgmv_shrink import sgmv_shrink def _raise_import_error(e): @@ -155,9 +155,6 @@ def add_lora( scale) - - - def add_lora_slice( y: torch.Tensor, x: torch.Tensor, @@ -232,6 +229,7 @@ def add_lora_slice( y_offset, ) + def add_lora_triton( y: torch.Tensor, x: torch.Tensor, @@ -248,11 +246,42 @@ def add_lora_triton( *, buffer: Optional[torch.Tensor] = None, ): + """ + Semantics: + y[i] += ( + x[i].unsqueeze(0) + @ wa_t_all[lora_index_tensor[i], layer_idx, :, :].transpose(-1, -2) + @ wb_t_all[lora_index_tensor[i], layer_idx, :, :].transpose(-1, -2) + * scale + ).squeeze(0) + Args: + y (torch.Tensor): (batch_size, output_dim).Will be changed in-place. + x (torch.Tensor): (batch_size, hidden_dim) + wa_t_all (torch.Tensor): (num_loras, lora_rank, hidden_dim) + wb_t_all (torch.Tensor): (num_loras, output_dim, lora_rank) + b_seq_start_tensor (torch.Tensor): (batch_size,). The cumulative + sequence lengths of the sequences in the batch, used to index + into sequence. E.g.,if the sequence length is [4, 6], it is + [0, 4]. Used only during the prefilling stage. + seq_length_tensor (torch.Tensor): batch_size,). record the sequence + length of the sequences in the batch. Used only during the + prefilling stage. + lora_index_tensor (torch.Tensor): (batch_size,). The LoRA index + corresponding to each batch + batch_size (int): batch size. Used only during the prefilling stage. + max_length (int): maximum seq length in the batch.Used only during the + prefilling stage. + layer_idx (int): Layer index of LoRA weights. + scale (float): Scaling factor. + is_prefilling (bool): True indicates the prefilling stage, while False + indicates the decoding stage." + buffer (Optional[torch.Tensor], optional): (batch_size,rank) + """ r = wb_t_all.size(-1) if buffer is None: - # We set the buffer to be float32 by default to avoid - # numerical inaccuracies that would otherwise happen - # due to downcasting. 
+ # We set the buffer to be float32 by default ,refer to: + # https://github.com/triton-lang/triton/issues/1387 + buffer = torch.zeros((x.size(0), r), dtype=torch.float32, device=x.device) @@ -342,6 +371,7 @@ def _lora_bgmv( batch_size, add_inputs=True) + def add_lora_triton_slice( y: torch.Tensor, x: torch.Tensor, @@ -361,30 +391,9 @@ def add_lora_triton_slice( buffer: Optional[torch.Tensor] = None, ): """ - Same as `add_lora` but you can operate on slices of y. + Same as `add_lora_triton` but you can operate on slices of y. Pass whole y, define y_offset and y_slice_size. - - Semantics: - y[i] += ( - x[i].unsqueeze(0) - @ wa_t_all[indices[i], layer_idx, :, :].transpose(-1, -2) - @ wb_t_all[indices[i], layer_idx, :, :].transpose(-1, -2) - * scale - ).squeeze(0) - - Args: - y: Shape: `[B, H2]`. Output vectors. Will be changed in-place. - x: Shape: `[B, H1]`. Input vectors. - wa_t_all: Shape: `[None, L, R, H1]`. All of the transposed - LoRA A matrices. - wb_t_all: Shape: `[None, L, H2, R]`. All of the transposed - LoRA B matrices. - indicies: Shape: `[B]`. Indices of the LoRA weights. - layer_idx: Layer index of LoRA weights. - scale: Scaling factor. - y_offset: Offset to apply to the starting column of y. - y_slice_size: Size of the y column slice. - #""" + """ # try: # import vllm._punica_C as punica_kernels # except ImportError as e: From a7b53708bf50886e1030810bf4145538d6b9e8a3 Mon Sep 17 00:00:00 2001 From: jeejeeli Date: Thu, 13 Jun 2024 14:21:42 +0800 Subject: [PATCH 19/71] tuning bgmv --- tests/lora/test_triton_punica.py | 79 +++++++++----------- vllm/lora/ops/bgmv_expand.py | 35 +++++---- vllm/lora/ops/bgmv_expand_slice.py | 37 ++++++---- vllm/lora/ops/bgmv_shrink.py | 27 ++++--- vllm/lora/ops/sgmv_expand_slice.py | 2 +- vllm/lora/ops/utils.py | 57 +++++++++++++++ vllm/lora/punica.py | 114 ++++++++++++++++++----------- 7 files changed, 226 insertions(+), 125 deletions(-) create mode 100644 vllm/lora/ops/utils.py diff --git a/tests/lora/test_triton_punica.py b/tests/lora/test_triton_punica.py index 8f28821a9336..a098aba16456 100644 --- a/tests/lora/test_triton_punica.py +++ b/tests/lora/test_triton_punica.py @@ -69,7 +69,7 @@ BATCHS = [1, 2, 4] + [_BATCH_SIZE_ALIGNMENT * i for i in range(1, 8)] NUM_LORA = [1, 4, 8, 16, 32, 64, 128, 256] -DTYPES = [torch.float16,torch.bfloat16] +DTYPES = [torch.float16, torch.bfloat16] MAX_RANKS = [1, 4, 8, 16, 32, 64, 128] SCALES = [0.5] OP_TYPES = ["shrink", "expand"] @@ -107,7 +107,7 @@ def _torch_groupgemm( out_list = [] current_offset = 0 for lora_index, b_length in zip(range(batchs), seq_len_tensor): - input_weight = inputs[current_offset : b_length + current_offset, :] + input_weight = inputs[current_offset:b_length + current_offset, :] current_offset += b_length lora_weight = lora_weights[lora_indices_tensor[lora_index]] result = torch.nn.functional.linear(input_weight, lora_weight) @@ -121,29 +121,27 @@ def _torch_groupgemm( return -def _generate_data( - batchs, hidden_size, lora_nums, max_rank, max_length, dtype, op_type, device -): +def _generate_data(batchs, hidden_size, lora_nums, max_rank, max_length, dtype, + op_type, device): if max_length == 1: max_length += 1 - seq_len_tensor = torch.randint(1, max_length, (batchs,)).to(device) + seq_len_tensor = torch.randint(1, max_length, (batchs, )).to(device) b_seq_start_loc = torch.cumsum( torch.tensor([0] + seq_len_tensor[:-1].tolist(), dtype=torch.long), dim=0, ).to(device) total_tokens = seq_len_tensor.sum() if op_type == "shrink": - inputs_tensor = torch.rand((total_tokens, 
hidden_size), dtype=dtype).to( - device - ) + inputs_tensor = torch.rand((total_tokens, hidden_size), + dtype=dtype).to(device) lora_weights = torch.rand( (lora_nums, max_rank, hidden_size), # col-major dtype=dtype, ).to(device) # shrink op need atomic_add, so output is initinized by 0 - ref_out_tensor = torch.zeros( - (total_tokens, max_rank), dtype=dtype, device=inputs_tensor.device - ) + ref_out_tensor = torch.zeros((total_tokens, max_rank), + dtype=dtype, + device=inputs_tensor.device) # NOTE shrink kernel using torch.float32 as output type our_out_tensor = torch.zeros( (total_tokens, max_rank), @@ -169,16 +167,15 @@ def _generate_data( # Ensure the same input. our_out_tensor = ref_out_tensor.clone() - lora_indices_tensor = torch.randint( - 0, lora_nums - 1 if lora_nums > 1 else 1, (batchs,) - ).to(device) + lora_indices_tensor = torch.randint(0, + lora_nums - 1 if lora_nums > 1 else 1, + (batchs, )).to(device) indices = torch.zeros((total_tokens), dtype=torch.long).to(device) current_offset = 0 for b_id in range(batchs): lora_index = lora_indices_tensor[b_id] - indices[ - current_offset : current_offset + seq_len_tensor[b_id] - ] = lora_index.item() + indices[current_offset:current_offset + + seq_len_tensor[b_id]] = lora_index.item() current_offset += seq_len_tensor[b_id].item() return ( inputs_tensor, @@ -192,12 +189,11 @@ def _generate_data( ) -def _generate_data_expand_nslices( - batchs, hidden_size, lora_nums, max_rank, max_length, dtype, nslices, device -): +def _generate_data_expand_nslices(batchs, hidden_size, lora_nums, max_rank, + max_length, dtype, nslices, device): if max_length == 1: max_length += 1 - seq_len_tensor = torch.randint(1, max_length, (batchs,)).to(device) + seq_len_tensor = torch.randint(1, max_length, (batchs, )).to(device) b_seq_start_loc = torch.cumsum( torch.tensor([0] + seq_len_tensor[:-1].tolist(), dtype=torch.long), dim=0, @@ -214,8 +210,7 @@ def _generate_data_expand_nslices( torch.rand( (lora_nums, hidden_size, max_rank), # col-major dtype=dtype, - ).to(device) - ) + ).to(device)) # expand op needs to complete y+=a@lora_b, so output is # initinized randomly ref_out_tensor = torch.rand( @@ -226,16 +221,15 @@ def _generate_data_expand_nslices( # Ensure the same input. 
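# Illustrative sketch (not from this patch): the column layout the nslices
# expand tests rely on. Slice i of the LoRA-B weights writes hidden_size
# columns starting at slice_offset = i * hidden_size of one shared output,
# matching the `slice_offset += hidden_size` loops in the tests; per-sequence
# LoRA selection is dropped here for brevity.
import torch

total_tokens, rank, hidden_size, nslices = 12, 16, 64, 2
x = torch.randn(total_tokens, rank)
lora_b = [torch.randn(hidden_size, rank) for _ in range(nslices)]
out = torch.zeros(total_tokens, hidden_size * nslices)

slice_offset = 0
for i in range(nslices):
    # expand slice i: out[:, offset:offset + hidden_size] += x @ B_i^T
    out[:, slice_offset:slice_offset + hidden_size] += x @ lora_b[i].t()
    slice_offset += hidden_size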
our_out_tensor = ref_out_tensor.clone() - lora_indices_tensor = torch.randint( - 0, lora_nums - 1 if lora_nums > 1 else 1, (batchs,) - ).to(device) + lora_indices_tensor = torch.randint(0, + lora_nums - 1 if lora_nums > 1 else 1, + (batchs, )).to(device) indices = torch.zeros((total_tokens), dtype=torch.long).to(device) current_offset = 0 for b_id in range(batchs): lora_index = lora_indices_tensor[b_id] - indices[ - current_offset : current_offset + seq_len_tensor[b_id] - ] = lora_index.item() + indices[current_offset:current_offset + + seq_len_tensor[b_id]] = lora_index.item() current_offset += seq_len_tensor[b_id].item() return ( inputs_tensor, @@ -363,9 +357,8 @@ def test_triton_sgmv_punica_bgmv( lora_indices_tensor, seq_len_tensor, indices, - ) = _generate_data( - batchs, hidden_size, num_loras, rank, seq_len, dtype, op_type, device - ) + ) = _generate_data(batchs, hidden_size, num_loras, rank, seq_len, dtype, + op_type, device) max_seq_length = seq_len_tensor.max() if isinstance(max_seq_length, tuple): @@ -444,9 +437,8 @@ def test_triton_bgmv_punica_bgmv( lora_indices_tensor, seq_len_tensor, indices, - ) = _generate_data( - batchs, hidden_size, num_loras, rank, seq_len, dtype, op_type, device - ) + ) = _generate_data(batchs, hidden_size, num_loras, rank, seq_len, dtype, + op_type, device) if op_type == "shrink": bgmv_shrink( @@ -454,7 +446,6 @@ def test_triton_bgmv_punica_bgmv( lora_weights, our_out_tensor, lora_indices_tensor, - batchs, scaling, ) else: @@ -463,7 +454,6 @@ def test_triton_bgmv_punica_bgmv( lora_weights, our_out_tensor, lora_indices_tensor, - batchs, add_inputs=True, ) lora_weights_4d = lora_weights.unsqueeze(dim=1) @@ -478,6 +468,7 @@ def test_triton_bgmv_punica_bgmv( ref_out_tensor = ref_out_tensor.to(torch.float32) assert_close(our_out_tensor, ref_out_tensor) + @pytest.mark.parametrize("batchs", BATCHS) @pytest.mark.parametrize("hidden_size", HIDDEN_SIZES) @pytest.mark.parametrize("nslices", NSLICES) @@ -485,7 +476,7 @@ def test_triton_bgmv_punica_bgmv( @pytest.mark.parametrize("seed", SEED) @pytest.mark.parametrize("device", CUDA_DEVICES) def test_sgmv_expand_slice( - batchs:int, + batchs: int, hidden_size: int, nslices: int, dtype: str, @@ -555,6 +546,7 @@ def test_sgmv_expand_slice( slice_offset += hidden_size assert_close(our_outputs, ref_outputs) + @pytest.mark.parametrize("batchs", BATCHS) @pytest.mark.parametrize("hidden_size", HIDDEN_SIZES) @pytest.mark.parametrize("nslices", NSLICES) @@ -562,7 +554,7 @@ def test_sgmv_expand_slice( @pytest.mark.parametrize("seed", SEED) @pytest.mark.parametrize("device", CUDA_DEVICES) def test_bgmv_expand_slice( - batchs:int, + batchs: int, hidden_size: int, nslices: int, dtype: str, @@ -604,8 +596,7 @@ def test_bgmv_expand_slice( our_outputs, lora_indices_tensor, slice_offset, - hidden_size, - batchs, + slice_size=hidden_size, add_inputs=True, ) lora_weights_4d = lora_weights.unsqueeze(dim=1) @@ -626,8 +617,8 @@ def test_bgmv_expand_slice( if __name__ == "__main__": test_bgmv_expand_slice( - batchs=256, - hidden_size=3424, + batchs=32, + hidden_size=128, nslices=2, dtype=torch.bfloat16, seed=0, diff --git a/vllm/lora/ops/bgmv_expand.py b/vllm/lora/ops/bgmv_expand.py index 8ec26bdb6b83..04fdd670243d 100644 --- a/vllm/lora/ops/bgmv_expand.py +++ b/vllm/lora/ops/bgmv_expand.py @@ -8,6 +8,8 @@ import torch import triton import triton.language as tl +from typing import Dict, Optional +from .utils import get_lora_op_configs @triton.jit @@ -88,8 +90,8 @@ def bgmv_expand( lora_b_weights: torch.Tensor, output_tensor: torch.Tensor, 
lora_indices_tensor: torch.Tensor, - batchs: int, - add_inputs: bool = False, + add_inputs: bool = True, + override_config: Optional[Dict[str, int]] = None, ): """ Args: @@ -110,7 +112,6 @@ def bgmv_expand( ] assert inputs.size(1) == lora_b_weights.size(-1) - assert lora_indices_tensor.size(0) == batchs assert inputs.is_contiguous() assert output_tensor.is_contiguous() @@ -125,9 +126,9 @@ def bgmv_expand( # TODO tuning this config N, K = lora_b_weights.shape[-2:] # K= rank,N=hidden_size - BLOCK_N = 256 + # BLOCK_N =64 BLOCK_K = triton.next_power_of_2(K) - SPLIT_N = 64 + # SPLIT_N = 8 EVEN_K = K % BLOCK_K == 0 ADD_INPUTS = add_inputs CAST_TYPE = False @@ -136,10 +137,17 @@ def bgmv_expand( torch.bfloat16, ]: CAST_TYPE = True - grid = [ - SPLIT_N, + config = {"BLOCK_N": 64, "SPLIT_N": 8} + batchs = lora_indices_tensor.size(0) + + if override_config: + config = override_config + else: + config = get_lora_op_configs("expand", batchs, N) + grid = lambda META: ( + META["SPLIT_N"], batchs, - ] + ) _bgmv_expand_kernel[grid]( inputs, lora_b_weights, @@ -154,11 +162,10 @@ def bgmv_expand( lora_b_weights.stride(2), output_tensor.stride(0), output_tensor.stride(1), - BLOCK_N, - BLOCK_K, - SPLIT_N, - EVEN_K, - ADD_INPUTS, - CAST_TYPE, + BLOCK_K=BLOCK_K, + EVEN_K=EVEN_K, + ADD_INPUTS=ADD_INPUTS, + CAST_TYPE=CAST_TYPE, + **config, ) return diff --git a/vllm/lora/ops/bgmv_expand_slice.py b/vllm/lora/ops/bgmv_expand_slice.py index 3d41d064ea7c..becaf4f1ca07 100644 --- a/vllm/lora/ops/bgmv_expand_slice.py +++ b/vllm/lora/ops/bgmv_expand_slice.py @@ -8,6 +8,8 @@ import torch import triton import triton.language as tl +from typing import Any, Dict, Optional +from .utils import get_lora_op_configs @triton.jit @@ -92,8 +94,8 @@ def bgmv_expand_slice( lora_indices_tensor: torch.Tensor, slice_offset: int, slice_size: int, - batchs: int, - add_inputs: bool = False, + add_inputs: bool = True, + override_config: Optional[Dict[str, int]] = None, ): """ Args: @@ -114,7 +116,7 @@ def bgmv_expand_slice( torch.bfloat16, ] assert inputs.size(1) == lora_b_weights.size(-1) - assert lora_indices_tensor.size(0) == batchs + assert slice_size == lora_b_weights.size(-2) assert inputs.is_contiguous() assert output_tensor.is_contiguous() @@ -130,9 +132,9 @@ def bgmv_expand_slice( # TODO tuning this config N, K = lora_b_weights.shape[-2:] # K= rank,N=hidden_size - BLOCK_N = 256 + # BLOCK_N = 256 BLOCK_K = triton.next_power_of_2(K) - SPLIT_N = 64 + # SPLIT_N = 64 EVEN_K = K % BLOCK_K == 0 ADD_INPUTS = add_inputs CAST_TYPE = False @@ -141,10 +143,18 @@ def bgmv_expand_slice( torch.bfloat16, ]: CAST_TYPE = True - grid = [ - SPLIT_N, + + batchs = lora_indices_tensor.size(0) + + if override_config: + config = override_config + else: + config = get_lora_op_configs("expand", batchs, N) + + grid = lambda META: ( + META["SPLIT_N"], batchs, - ] + ) _bgmv_expand_slice_kernel[grid]( inputs, lora_b_weights, @@ -160,11 +170,10 @@ def bgmv_expand_slice( output_tensor.stride(0), output_tensor.stride(1), slice_offset, - BLOCK_N, - BLOCK_K, - SPLIT_N, - EVEN_K, - ADD_INPUTS, - CAST_TYPE, + BLOCK_K=BLOCK_K, + EVEN_K=EVEN_K, + ADD_INPUTS=ADD_INPUTS, + CAST_TYPE=CAST_TYPE, + **config, ) return diff --git a/vllm/lora/ops/bgmv_shrink.py b/vllm/lora/ops/bgmv_shrink.py index d2166a5593ab..99b9d7ee5b9f 100644 --- a/vllm/lora/ops/bgmv_shrink.py +++ b/vllm/lora/ops/bgmv_shrink.py @@ -8,6 +8,8 @@ import torch import triton import triton.language as tl +from typing import Dict, Optional +from .utils import get_lora_op_configs @triton.jit @@ -81,8 +83,8 @@ def 
bgmv_shrink( lora_a_weights: torch.Tensor, output_tensor: torch.Tensor, lora_indices_tensor: torch.Tensor, - batchs: int, - scaling: float, + scaling: float = 1.0, + override_config: Optional[Dict[str, int]] = None, ): """ Args: @@ -101,7 +103,6 @@ def bgmv_shrink( torch.bfloat16, ] assert inputs.size(1) == lora_a_weights.size(-1) - assert lora_indices_tensor.size(0) == batchs assert inputs.is_contiguous() if lora_a_weights.ndim == 4: # shape:(lora_num,1,rank, size) @@ -112,14 +113,19 @@ def bgmv_shrink( assert lora_a_weights.is_contiguous() assert output_tensor.is_contiguous() # TODO tuning this config + batchs = lora_indices_tensor.size(0) N, K = lora_a_weights.shape[-2:] # K=hidden_size,N=rank - BLOCK_K = 256 BLOCK_N = triton.next_power_of_2(output_tensor.size(1)) - SPLIT_K = 64 - grid = [ - SPLIT_K, + if override_config: + config = override_config + else: + # First try to load optimal config from the file + config = get_lora_op_configs("shrink", batchs, K) + + grid = lambda META: ( + META["SPLIT_K"], batchs, - ] + ) _bgmv_shrink_kernel[grid]( inputs, lora_a_weights, @@ -135,8 +141,7 @@ def bgmv_shrink( lora_a_weights.stride(2), output_tensor.stride(0), output_tensor.stride(1), - BLOCK_N, - BLOCK_K, - SPLIT_K, + BLOCK_N=BLOCK_N, + **config, ) return diff --git a/vllm/lora/ops/sgmv_expand_slice.py b/vllm/lora/ops/sgmv_expand_slice.py index 25975c7ed5fb..2fdedd591032 100644 --- a/vllm/lora/ops/sgmv_expand_slice.py +++ b/vllm/lora/ops/sgmv_expand_slice.py @@ -60,7 +60,7 @@ def _sgmv_expand_slice_kernel( offset_k = tl.arange(0, BLOCK_K) ram = tl.max_contiguous(tl.multiple_of(offset_m % M, BLOCK_M), BLOCK_M) rbn = tl.max_contiguous(tl.multiple_of(offset_n % N, BLOCK_N), BLOCK_N) - + a_ptr = (input_ptr + cur_seq_start * xm_stride + ram[:, None] * xm_stride + offset_k[None, :] * xk_stride, ) b_ptr = (lora_ptr + l0_stride * lora_index + diff --git a/vllm/lora/ops/utils.py b/vllm/lora/ops/utils.py new file mode 100644 index 000000000000..fb8110b90564 --- /dev/null +++ b/vllm/lora/ops/utils.py @@ -0,0 +1,57 @@ +import functools +import json +import os +from typing import Dict, Optional + + +def _get_config_file_name( + op_type: str, + batchs: int, + hidden_size: int, +) -> str: + # device_name = torch.cuda.get_device_name().replace(" ", "_") + device_name = "NVIDIA_GeForce_RTX_3090" + return ( + f"op_type={op_type},batchs={batchs},hidden_size={hidden_size} " + + f"device_name={device_name}.json" + ) + + +@functools.lru_cache +def _get_op_configs( + op_type: str, batch: int, hidden_size: int +) -> Optional[Dict[str, int]]: + FOLDER_NAME = "bgmv_configs" + json_file_name = _get_config_file_name(op_type, batch, hidden_size) + + config_file_path = os.path.join( + os.path.dirname(os.path.realpath(__file__)), + FOLDER_NAME, + json_file_name, + ) + if os.path.exists(config_file_path): + with open(config_file_path) as f: + tuned_config = json.load(f).get( + f"batchs={batch},hidden_size={hidden_size}", None + ) + return tuned_config + + # If no optimized configuration is available, return None + return None + + +def _get_default_config(op_type: str, batch: int, hidden_size: int): + if op_type == "expand": + return {"BLOCK_N": 256, "SPLIT_N": 8, "num_warps": 8} + else: + return {"BLOCK_K": 32, "SPLIT_K": 64, "num_warps": 8} + # raise NotImplementedError + + +def get_lora_op_configs( + op_type: str, batch: int, hidden_size: int +) -> Dict[str, int]: + config = _get_op_configs(op_type, batch, hidden_size) + if not config: + config = _get_default_config(op_type, batch, hidden_size) + return config diff --git 
a/vllm/lora/punica.py b/vllm/lora/punica.py index 7a07e73a116c..ba387fc2010f 100644 --- a/vllm/lora/punica.py +++ b/vllm/lora/punica.py @@ -246,7 +246,7 @@ def add_lora_triton( *, buffer: Optional[torch.Tensor] = None, ): - """ + """ Semantics: y[i] += ( x[i].unsqueeze(0) @@ -264,18 +264,18 @@ def add_lora_triton( into sequence. E.g.,if the sequence length is [4, 6], it is [0, 4]. Used only during the prefilling stage. seq_length_tensor (torch.Tensor): batch_size,). record the sequence - length of the sequences in the batch. Used only during the + length of the sequences in the batch. Used only during the prefilling stage. lora_index_tensor (torch.Tensor): (batch_size,). The LoRA index corresponding to each batch batch_size (int): batch size. Used only during the prefilling stage. - max_length (int): maximum seq length in the batch.Used only during the + max_length (int): maximum seq length in the batch.Used only during the prefilling stage. layer_idx (int): Layer index of LoRA weights. scale (float): Scaling factor. - is_prefilling (bool): True indicates the prefilling stage, while False + is_prefilling (bool): True indicates the prefilling stage, while False indicates the decoding stage." - buffer (Optional[torch.Tensor], optional): (batch_size,rank) + buffer (Optional[torch.Tensor], optional): (batch_size,rank) """ r = wb_t_all.size(-1) if buffer is None: @@ -307,7 +307,6 @@ def add_lora_triton( wa_t_all, wb_t_all, lora_indices_tensor, - batch_size, layer_idx, scale, buffer=buffer, @@ -358,18 +357,12 @@ def _lora_bgmv( wa_t_all: torch.Tensor, wb_t_all: torch.Tensor, lora_indices_tensor: torch.Tensor, - batch_size: int, layer_idx: int, scale: float, buffer: torch.Tensor, ): - bgmv_shrink(x, wa_t_all, buffer, lora_indices_tensor, batch_size, scale) - bgmv_expand(buffer, - wb_t_all, - y, - lora_indices_tensor, - batch_size, - add_inputs=True) + bgmv_shrink(x, wa_t_all, buffer, lora_indices_tensor, scale) + bgmv_expand(buffer, wb_t_all, y, lora_indices_tensor, add_inputs=True) def add_lora_triton_slice( @@ -408,22 +401,53 @@ def add_lora_triton_slice( dtype=torch.float32, device=x.device) if is_prefilling: - _lora_sgmv_nslice(y, x, wa_t_all, wb_t_all, b_seq_start_tensor, - seq_length_tensor, lora_indices_tensor, batch_size, - max_length, layer_idx, scale, y_offset, y_slice_size, - buffer) + _lora_sgmv_nslice( + y, + x, + wa_t_all, + wb_t_all, + b_seq_start_tensor, + seq_length_tensor, + lora_indices_tensor, + batch_size, + max_length, + layer_idx, + scale, + y_offset, + y_slice_size, + buffer, + ) else: - _lora_bgmv_nslice(y, x, wa_t_all, wb_t_all, lora_indices_tensor, - batch_size, layer_idx, scale, y_offset, y_slice_size, - buffer) + _lora_bgmv_nslice( + y, + x, + wa_t_all, + wb_t_all, + lora_indices_tensor, + layer_idx, + scale, + y_offset, + y_slice_size, + buffer, + ) -def _lora_sgmv_nslice(y: torch.Tensor, x: torch.Tensor, wa_t_all: torch.Tensor, - wb_t_all: torch.Tensor, b_seq_start_tensor: torch.Tensor, - seq_length_tensor: torch.Tensor, - lora_indices_tensor: torch.Tensor, batch_size: int, - max_length: int, layer_idx: int, scale: float, - y_offset: int, y_slice_size: int, buffer): +def _lora_sgmv_nslice( + y: torch.Tensor, + x: torch.Tensor, + wa_t_all: torch.Tensor, + wb_t_all: torch.Tensor, + b_seq_start_tensor: torch.Tensor, + seq_length_tensor: torch.Tensor, + lora_indices_tensor: torch.Tensor, + batch_size: int, + max_length: int, + layer_idx: int, + scale: float, + y_offset: int, + y_slice_size: int, + buffer, +): sgmv_shrink( x, wa_t_all, @@ -450,17 +474,25 @@ def 
_lora_sgmv_nslice(y: torch.Tensor, x: torch.Tensor, wa_t_all: torch.Tensor, ) -def _lora_bgmv_nslice(y: torch.Tensor, x: torch.Tensor, wa_t_all: torch.Tensor, - wb_t_all: torch.Tensor, - lora_indices_tensor: torch.Tensor, batch_size: int, - layer_idx: int, scale: float, y_offset: int, - y_slice_size: int, buffer): - bgmv_shrink(x, wa_t_all, buffer, lora_indices_tensor, batch_size, scale) - bgmv_expand_slice(buffer, - wb_t_all, - y, - lora_indices_tensor, - y_offset, - y_slice_size, - batch_size, - add_inputs=True) +def _lora_bgmv_nslice( + y: torch.Tensor, + x: torch.Tensor, + wa_t_all: torch.Tensor, + wb_t_all: torch.Tensor, + lora_indices_tensor: torch.Tensor, + layer_idx: int, + scale: float, + y_offset: int, + y_slice_size: int, + buffer, +): + bgmv_shrink(x, wa_t_all, buffer, lora_indices_tensor, scale) + bgmv_expand_slice( + buffer, + wb_t_all, + y, + lora_indices_tensor, + y_offset, + y_slice_size, + add_inputs=True, + ) From dc72d7ab821b8c4434d2d8192a400125da12433a Mon Sep 17 00:00:00 2001 From: jeejeeli Date: Thu, 13 Jun 2024 14:32:03 +0800 Subject: [PATCH 20/71] add tuning config --- ...dden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...den_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...en_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...en_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...den_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...dden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...idden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...dden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...en_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...en_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...den_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...den_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...dden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...den_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...dden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...den_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...dden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...den_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...idden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...dden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...den_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...dden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...den_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...dden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...den_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...den_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...den_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...den_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...den_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...dden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...dden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...dden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...den_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...dden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...den_size=43264 
device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...dden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...den_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...idden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...dden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...dden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...dden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...dden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...dden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...den_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...den_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...dden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...dden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...dden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...dden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...dden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...dden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...den_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...en_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...en_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...den_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...dden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...idden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...dden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...en_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...en_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...den_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...den_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...dden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...den_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...dden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...den_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...dden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...den_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...idden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...dden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...den_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...dden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...den_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...dden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...den_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...den_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...den_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...den_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...den_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...dden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...dden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...dden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...den_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...dden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...den_size=43264 
device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...dden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...den_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...idden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...dden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...dden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...dden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...dden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...dden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...den_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...den_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...dden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...dden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...dden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...dden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...dden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...dden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...den_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...en_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...en_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...den_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...dden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...idden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...dden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...en_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...en_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...den_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...den_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...dden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...den_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...dden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...den_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...dden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...den_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...idden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...dden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...den_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...dden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...den_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...dden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...den_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...den_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...den_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...den_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...den_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...dden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...dden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...dden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...den_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...dden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...den_size=43264 
device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...dden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...den_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...idden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...dden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...dden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...dden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...dden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...dden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...den_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...den_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...dden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...dden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...dden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...dden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...dden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...dden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...den_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...en_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...en_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...den_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...dden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...idden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...dden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...en_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...en_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...den_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...den_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...dden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...den_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...dden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...den_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...dden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...den_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...idden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...dden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...den_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...dden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...den_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...dden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...den_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...den_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...den_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...den_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...den_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...dden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...dden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...dden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...den_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...dden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...den_size=43264 
device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++
...dden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++
...den_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++
...idden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++
...dden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++
...dden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++
...dden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++
...dden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++
...dden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++
...den_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++
...den_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++
...dden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++
...dden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++
...dden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++
...dden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++
...dden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++
vllm/lora/ops/utils.py | 3 +--
1151 files changed, 8051 insertions(+), 2 deletions(-)
create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json
create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json
create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json
create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json
create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json
create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json
create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json
create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json
create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json
create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json
create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json
create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json
create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json
create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json
create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json
create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json
create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json
create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json
create mode 100644
vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=6848 
device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 
vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 
vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=32768 
device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 
vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 
vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 
vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 
vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=15360 
device_name=NVIDIA_GeForce_RTX_3090.json
create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size={2048, 22016, 2304, 24576, 256, 2560, 27392, 2752, 27648, 3072, 32000, 32256, 32512, 32768, 33024, 3328, 3456, 3584, 36864, 4096, 43264, 4608, 49152, 512, 5120, 5504, 5632, 6144, 6400, 64000, 64256, 6848, 6912, 7168, 8192, 9216} device_name=NVIDIA_GeForce_RTX_3090.json (one config file per listed hidden_size; likewise for the lines below)
create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size={1024, 10240, 102400, 102656, 11008, 1152, 128, 1280, 128000, 128256, 13824, 14336, 1536, 15360, 2048, 22016, 2304, 24576, 256, 2560, 27392, 2752, 27648, 3072, 32000, 32256, 32512, 32768, 33024, 3328, 3456, 3584, 36864, 4096, 43264, 4608, 49152, 512, 5120, 5504, 5632, 6144, 6400, 64000, 64256, 6848, 6912, 7168, 8192, 9216} device_name=NVIDIA_GeForce_RTX_3090.json
create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size={1024, 10240, 102400, 102656, 11008, 1152, 128, 1280, 128000, 128256, 13824, 14336, 1536, 15360, 2048, 22016, 2304, 24576, 256, 2560, 27392, 2752, 27648, 3072, 32000, 32256, 32512, 32768, 33024, 3328, 3456, 3584, 36864, 4096, 43264, 4608, 49152, 512, 5120, 5504, 5632, 6144, 6400, 64000, 64256, 6848, 6912, 7168, 8192, 9216} device_name=NVIDIA_GeForce_RTX_3090.json
create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size={1024, 10240, 102400, 102656, 11008, 1152, 128, 1280, 128000, 128256, 13824, 14336, 1536, 15360, 2048, 22016, 2304, 24576, 256, 2560, 27392, 2752, 27648, 3072, 32000, 32256, 32512, 32768, 33024, 3328, 3456, 3584, 36864, 4096, 43264, 4608, 49152, 512, 5120, 5504, 5632, 6144, 6400, 64000, 64256, 6848, 6912, 7168, 8192, 9216} device_name=NVIDIA_GeForce_RTX_3090.json
create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size={1024, 10240, 102400, 102656, 11008, 1152, 128, 1280, 128000, 128256, 13824, 14336, 1536, 15360, 2048, 22016, 2304, 24576, 256, 2560, 27392, 2752, 27648, 3072, 32000, 32256, 32512, 32768, 33024, 3328, 3456, 3584, 36864, 4096, 43264, 4608, 49152, 512, 5120, 5504, 5632, 6144, 6400, 64000, 64256, 6848, 6912, 7168, 8192, 9216} device_name=NVIDIA_GeForce_RTX_3090.json
create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size={1024, 10240, 102400, 102656, 11008, 1152, 128, 1280, 128000, 128256, 13824, 14336, 1536, 15360, 2048, 22016, 2304, 24576, 256, 2560, 27392, 2752, 27648, 3072, 32000, 32256, 32512, 32768, 33024, 3328, 3456, 3584, 36864, 4096, 43264, 4608, 49152, 512, 5120, 5504, 5632, 6144, 6400, 64000, 64256, 6848, 6912, 7168, 8192, 9216} device_name=NVIDIA_GeForce_RTX_3090.json
create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size={1024, 10240, 102400, 102656, 11008, 1152, 128, 1280, 128000, 128256, 13824, 14336, 1536, 15360, 2048, 22016, 2304, 24576, 256, 2560, 27392, 2752, 27648, 3072, 32000, 32256, 32512, 32768, 33024, 3328, 3456, 3584, 36864, 4096, 43264, 4608, 49152, 512, 5120, 5504, 5632, 6144, 6400, 64000, 64256, 6848, 6912, 7168, 8192, 9216} device_name=NVIDIA_GeForce_RTX_3090.json
create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size={1024, 10240, 102400, 102656, 11008, 1152, 128, 1280, 128000, 128256, 13824, 14336, 1536, 15360, 2048, 22016, 2304, 24576, 256, 2560, 27392, 2752, 27648, 3072, 32000, 32256, 32512, 32768, 33024, 3328, 3456, 3584, 36864, 4096, 43264, 4608, 49152, 512, 5120, 5504, 5632, 6144, 6400, 64000, 64256, 6848, 6912, 7168, 8192, 9216} device_name=NVIDIA_GeForce_RTX_3090.json
create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size={1024, 10240, 102400, 102656, 11008, 1152, 128, 1280, 128000, 128256, 13824, 14336, 1536, 15360, 2048, 22016, 2304, 24576, 256, 2560, 27392, 2752, 27648, 3072, 32000, 32256, 32512, 32768, 33024, 3328, 3456, 3584, 36864, 4096, 43264, 4608, 49152, 512, 5120, 5504, 5632, 6144, 6400, 64000, 64256, 6848, 6912, 7168, 8192, 9216} device_name=NVIDIA_GeForce_RTX_3090.json
create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size={1024, 10240, 102400, 102656, 11008, 1152, 128, 1280, 128000, 128256, 13824, 14336, 1536, 15360, 2048, 22016, 2304, 24576, 256, 2560, 27392, 2752, 27648, 3072, 32000, 32256, 32512, 32768, 33024, 3328, 3456, 3584, 36864, 4096, 43264, 4608, 49152, 512, 5120, 5504, 5632, 6144, 6400, 64000, 64256, 6848, 6912, 7168, 8192, 9216} device_name=NVIDIA_GeForce_RTX_3090.json
create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size={1024, 10240, 102400, 102656, 11008, 1152, 128, 1280, 128000, 128256, 13824, 14336, 1536, 15360, 2048, 22016, 2304, 24576, 256, 2560, 27392, 2752, 27648, 3072, 32000, 32256, 32512, 32768, 33024, 3328, 3456, 3584, 36864, 4096, 43264, 4608, 49152, 512, 5120, 5504, 5632, 6144, 6400, 64000, 64256, 6848, 6912, 7168, 8192, 9216} device_name=NVIDIA_GeForce_RTX_3090.json
create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size={1024, 10240, 102400, 102656, 11008, 1152, 128, 1280, 128000, 128256, 13824, 14336, 1536, 15360, 2048, 22016, 2304, 24576, 256, 2560, 27392, 2752, 27648, 3072, 32000, 32256, 32512, 32768, 33024, 3328, 3456, 3584, 36864, 4096, 43264, 4608} device_name=NVIDIA_GeForce_RTX_3090.json
create mode 100644
vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 
vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 
vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 
vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 
vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=2752 
device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 
vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 
vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 
vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 
vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 
vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 
vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json
diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json
new file mode 100644
index 000000000000..3b39ab85d9b3
--- /dev/null
+++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json
@@ -0,0 +1,7 @@
+{
+    "batchs=1,hidden_size=1024": {
+        "BLOCK_N": 32,
+        "SPLIT_N": 64,
+        "num_warps": 8
+    }
+}
diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json
new file mode 100644
index 000000000000..926c453330ce
--- /dev/null
+++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json
@@ -0,0 +1,7 @@
+{
+    "batchs=1,hidden_size=10240": {
+        "BLOCK_N": 64,
+        "SPLIT_N": 256,
+        "num_warps": 8
+    }
+}
diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json
new file mode 100644
index 000000000000..3403b6a8a156
--- /dev/null
+++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json
@@ -0,0 +1,7 @@
+{
+    "batchs=1,hidden_size=102400": {
+        "BLOCK_N": 64,
+        "SPLIT_N": 256,
+        "num_warps": 8
+    }
+}
diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json
new file mode 100644
index 000000000000..00a40f0fb282
--- /dev/null
+++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json
@@ -0,0 +1,7 @@
+{
+    "batchs=1,hidden_size=102656": {
+        "BLOCK_N": 64,
+        "SPLIT_N": 256,
+        "num_warps": 8
+    }
+}
diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json
new file mode 100644
index 000000000000..fe2ef151f545
--- /dev/null
+++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json
@@ -0,0 +1,7 @@
+{
+    "batchs=1,hidden_size=11008": {
+        "BLOCK_N": 64,
+        "SPLIT_N": 256,
+        "num_warps": 8
+    }
+}
diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json
new file mode 100644
index 000000000000..e72812a699b8
--- /dev/null
+++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json
@@ -0,0 +1,7 @@
+{
+    "batchs=1,hidden_size=1152": {
+        "BLOCK_N": 32,
+        "SPLIT_N": 64,
+        "num_warps": 8
+    }
+}
diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json
new file mode 100644
index 000000000000..6af79154d137
--- /dev/null
+++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json
@@ -0,0 +1,7 @@
+{
+    "batchs=1,hidden_size=128": {
+        "BLOCK_N": 32,
+        "SPLIT_N": 16,
+        "num_warps": 8
+    }
+}
diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json
new file mode 100644
index 000000000000..ede75bf5ee4a
--- /dev/null
+++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json
@@ -0,0 +1,7 @@
+{
+    "batchs=1,hidden_size=1280": {
+        "BLOCK_N": 32,
+        "SPLIT_N": 64,
+        "num_warps": 8
+    }
+}
diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json
new file mode 100644
index 000000000000..7c18b5d9e89a
--- /dev/null
+++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json
@@ -0,0 +1,7 @@
+{
+    "batchs=1,hidden_size=128000": {
+        "BLOCK_N": 256,
+        "SPLIT_N": 256,
+        "num_warps": 8
+    }
+}
diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json
new file mode 100644
index 000000000000..74123059d34c
--- /dev/null
+++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json
@@ -0,0 +1,7 @@
+{
+    "batchs=1,hidden_size=128256": {
+        "BLOCK_N": 256,
+        "SPLIT_N": 256,
+        "num_warps": 8
+    }
+}
diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json
new file mode 100644
index 000000000000..e6204367ba9f
--- /dev/null
+++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json
@@ -0,0 +1,7 @@
+{
+    "batchs=1,hidden_size=13824": {
+        "BLOCK_N": 64,
+        "SPLIT_N": 256,
+        "num_warps": 8
+    }
+}
diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json
new file mode 100644
index 000000000000..619d49755fbd
--- /dev/null
+++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json
@@ -0,0 +1,7 @@
+{
+    "batchs=1,hidden_size=14336": {
+        "BLOCK_N": 64,
+        "SPLIT_N": 256,
+        "num_warps": 8
+    }
+}
diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json
new file mode 100644
index 000000000000..da168958d44b
--- /dev/null
+++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json
@@ -0,0 +1,7 @@
+{
+    "batchs=1,hidden_size=1536": {
+        "BLOCK_N": 32,
+        "SPLIT_N": 64,
+        "num_warps": 8
+    }
+}
diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json
new file mode 100644
index 000000000000..b774e5e73509
--- /dev/null
+++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json
@@ -0,0 +1,7 @@
+{
+    "batchs=1,hidden_size=15360": {
+        "BLOCK_N": 32,
+        "SPLIT_N": 256,
+        "num_warps": 8
+    }
+}
diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json
new file mode 100644
index 000000000000..5df0d12a0066
--- /dev/null
+++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json
@@ -0,0 +1,7 @@
+{
+    "batchs=1,hidden_size=2048": {
+        "BLOCK_N": 32,
+        "SPLIT_N": 64,
+        "num_warps": 8
+    }
+}
diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json
new file mode 100644
index 000000000000..e669eec80db8
--- /dev/null
+++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json
@@ -0,0 +1,7 @@
+{
+    "batchs=1,hidden_size=22016": {
+        "BLOCK_N": 32,
+        "SPLIT_N": 256,
+        "num_warps": 8
+    }
+}
diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json
new file mode 100644
index 000000000000..6f248613276a
--- /dev/null
+++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json
@@ -0,0 +1,7 @@
+{
+    "batchs=1,hidden_size=2304": {
+        "BLOCK_N": 32,
+        "SPLIT_N": 128,
+        "num_warps": 8
+    }
+}
diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json
new file mode 100644
index 000000000000..4b800fceca15
--- /dev/null
+++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json
@@ -0,0 +1,7 @@
+{
+    "batchs=1,hidden_size=24576": {
+        "BLOCK_N": 32,
+        "SPLIT_N": 256,
+        "num_warps": 8
+    }
+}
diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json
new file mode 100644
index 000000000000..97ef5bd49850
--- /dev/null
+++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json
@@ -0,0 +1,7 @@
+{
+    "batchs=1,hidden_size=256": {
+        "BLOCK_N": 32,
+        "SPLIT_N": 32,
+        "num_warps": 8
+    }
+}
diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json
new file mode 100644
index 000000000000..498985dfa565
--- /dev/null
+++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json
@@ -0,0 +1,7 @@
+{
+    "batchs=1,hidden_size=2560": {
+        "BLOCK_N": 32,
+        "SPLIT_N": 128,
+        "num_warps": 8
+    }
+}
diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json
new file mode 100644
index 000000000000..888779c1a242
--- /dev/null
+++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json
@@ -0,0 +1,7 @@
+{
+    "batchs=1,hidden_size=27392": {
+        "BLOCK_N": 128,
+        "SPLIT_N": 256,
+        "num_warps": 8
+    }
+}
diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json
new file mode 100644
index 000000000000..90c40f66516a
--- /dev/null
+++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json
@@ -0,0 +1,7 @@
+{
+    "batchs=1,hidden_size=2752": {
+        "BLOCK_N": 64,
+        "SPLIT_N": 64,
+        "num_warps": 8
+    }
+}
diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json
new file mode 100644
index 000000000000..725987ef135d
--- /dev/null
+++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json
@@ -0,0 +1,7 @@
+{
+    "batchs=1,hidden_size=27648": {
+        "BLOCK_N": 128,
+        "SPLIT_N": 256,
+        "num_warps": 8
+    }
+}
diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json
new file mode 100644
index 000000000000..6758c49d6d53
--- /dev/null
+++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json
@@ -0,0 +1,7 @@
+{
+    "batchs=1,hidden_size=3072": {
+        "BLOCK_N": 32,
+        "SPLIT_N": 128,
+        "num_warps": 8
+    }
+}
diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json
new file mode 100644
index 000000000000..739073148751
--- /dev/null
+++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json
@@ -0,0 +1,7 @@
+{
+    "batchs=1,hidden_size=32000": {
+        "BLOCK_N": 128,
+        "SPLIT_N": 256,
+        "num_warps": 8
+    }
+}
diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json
new file mode 100644
index 000000000000..3ba9089734f5
--- /dev/null
+++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json
@@ -0,0 +1,7 @@
+{
+    "batchs=1,hidden_size=32256": {
+        "BLOCK_N": 64,
+        "SPLIT_N": 256,
+        "num_warps": 8
+    }
+}
diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json
new file mode 100644
index 000000000000..74d73ee28866
--- /dev/null
+++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json
@@ -0,0 +1,7 @@
+{
+    "batchs=1,hidden_size=32512": {
+        "BLOCK_N": 128,
+        "SPLIT_N": 256,
+        "num_warps": 4
+    }
+}
diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json
new file mode 100644
index 000000000000..cd011852520e
--- /dev/null
+++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json
@@ -0,0 +1,7 @@
+{
+    "batchs=1,hidden_size=32768": {
+        "BLOCK_N": 64,
+        "SPLIT_N": 256,
+        "num_warps": 8
+    }
+}
diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json
new file mode 100644
index 000000000000..bdb74bae1096
--- /dev/null
+++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json
@@ -0,0 +1,7 @@
+{
+    "batchs=1,hidden_size=33024": {
+        "BLOCK_N": 64,
+        "SPLIT_N": 256,
+        "num_warps": 8
+    }
+}
diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json
new file mode 100644
index 000000000000..90067aae86ea
--- /dev/null
+++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json
@@ -0,0 +1,7 @@
+{
+    "batchs=1,hidden_size=3328": {
+        "BLOCK_N": 32,
+        "SPLIT_N": 128,
+        "num_warps": 8
+    }
+}
diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json
new file mode 100644
index 000000000000..266f0b4643a2
--- /dev/null
+++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json
@@ -0,0 +1,7 @@
+{
+    "batchs=1,hidden_size=3456": {
+        "BLOCK_N": 32,
+        "SPLIT_N": 128,
+        "num_warps": 8
+    }
+}
diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json
new file mode 100644
index 000000000000..50eef61c7dc0
--- /dev/null
+++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json
@@ -0,0 +1,7 @@
+{
+    "batchs=1,hidden_size=3584": {
+        "BLOCK_N": 32,
+        "SPLIT_N": 128,
+        "num_warps": 8
+    }
+}
diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json
new file mode 100644
index 000000000000..1a0e6bad928a
--- /dev/null
+++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json
@@ -0,0 +1,7 @@
+{
+    "batchs=1,hidden_size=36864": {
+        "BLOCK_N": 32,
+        "SPLIT_N": 256,
+        "num_warps": 8
+    }
+}
diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json
new file mode 100644
index 000000000000..6379489182c5
--- /dev/null
+++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json
@@ -0,0 +1,7 @@
+{
+    "batchs=1,hidden_size=4096": {
+        "BLOCK_N": 32,
+        "SPLIT_N": 128,
+        "num_warps": 8
+    }
+}
diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json
new file mode 100644
index 000000000000..845e90469c7a
--- /dev/null
+++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json
@@ -0,0 +1,7 @@
+{
+    "batchs=1,hidden_size=43264": {
+        "BLOCK_N": 64,
+        "SPLIT_N": 256,
+        "num_warps": 8
+    }
+}
diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json
new file mode 100644
index 000000000000..6da3f2cdd17e
--- /dev/null
+++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json
@@ -0,0 +1,7 @@
+{
+    "batchs=1,hidden_size=4608": {
+        "BLOCK_N": 32,
+        "SPLIT_N": 256,
+        "num_warps": 4
+    }
+}
diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json
b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..5859f692c3b2 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=1,hidden_size=49152": { + "BLOCK_N": 64, + "SPLIT_N": 256, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..a1ed01126386 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=1,hidden_size=512": { + "BLOCK_N": 32, + "SPLIT_N": 32, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..20e62377ef27 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=1,hidden_size=5120": { + "BLOCK_N": 32, + "SPLIT_N": 256, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..cfd9a3f149ff --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=1,hidden_size=5504": { + "BLOCK_N": 64, + "SPLIT_N": 128, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..7e7ff82dc5f3 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=1,hidden_size=5632": { + "BLOCK_N": 32, + "SPLIT_N": 256, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..6312c21225d5 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=1,hidden_size=6144": { + "BLOCK_N": 32, + "SPLIT_N": 256, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..7bda71f1c3e4 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=1,hidden_size=6400": { + "BLOCK_N": 32, + "SPLIT_N": 256, + 
"num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..c39485cbc08c --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=1,hidden_size=64000": { + "BLOCK_N": 64, + "SPLIT_N": 256, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..b1adfbe01c2c --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=1,hidden_size=64256": { + "BLOCK_N": 64, + "SPLIT_N": 256, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..0b1aee061aa7 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=1,hidden_size=6848": { + "BLOCK_N": 128, + "SPLIT_N": 64, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..3a894b412e3f --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=1,hidden_size=6912": { + "BLOCK_N": 32, + "SPLIT_N": 256, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..0132c4375421 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=1,hidden_size=7168": { + "BLOCK_N": 32, + "SPLIT_N": 256, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..9ffe008aa83e --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=1,hidden_size=8192": { + "BLOCK_N": 32, + "SPLIT_N": 256, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..838189dba35d --- /dev/null +++ 
b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=1,hidden_size=9216": { + "BLOCK_N": 64, + "SPLIT_N": 256, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..5b1da4d44b94 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=16,hidden_size=1024": { + "BLOCK_N": 64, + "SPLIT_N": 16, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..c392909217f5 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=16,hidden_size=10240": { + "BLOCK_N": 32, + "SPLIT_N": 16, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..2e160c4ae390 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=16,hidden_size=102400": { + "BLOCK_N": 128, + "SPLIT_N": 16, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..46428cc0a9da --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=16,hidden_size=102656": { + "BLOCK_N": 512, + "SPLIT_N": 32, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..34ff5ebb9fe7 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=16,hidden_size=11008": { + "BLOCK_N": 128, + "SPLIT_N": 16, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..1375324c09ad --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=16,hidden_size=1152": { + "BLOCK_N": 64, + "SPLIT_N": 32, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json 
b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..e8b0e9dbe8b0 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=16,hidden_size=128": { + "BLOCK_N": 32, + "SPLIT_N": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..72fc4afd1efe --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=16,hidden_size=1280": { + "BLOCK_N": 32, + "SPLIT_N": 16, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..97e7d9e7bd0d --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=16,hidden_size=128000": { + "BLOCK_N": 1024, + "SPLIT_N": 16, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..40a4a9526be0 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=16,hidden_size=128256": { + "BLOCK_N": 128, + "SPLIT_N": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..ac35eea6297a --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=16,hidden_size=13824": { + "BLOCK_N": 32, + "SPLIT_N": 32, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..38b1819b0120 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=16,hidden_size=14336": { + "BLOCK_N": 128, + "SPLIT_N": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..668669e9fb4a --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=16,hidden_size=1536": { + 
"BLOCK_N": 64, + "SPLIT_N": 32, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..c23e4b555ab4 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=16,hidden_size=15360": { + "BLOCK_N": 512, + "SPLIT_N": 32, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..30715168cdd8 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=16,hidden_size=2048": { + "BLOCK_N": 64, + "SPLIT_N": 16, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..050f3384e1cf --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=16,hidden_size=22016": { + "BLOCK_N": 256, + "SPLIT_N": 16, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..3820959d0032 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=16,hidden_size=2304": { + "BLOCK_N": 32, + "SPLIT_N": 16, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..08d8f70e1e7a --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=16,hidden_size=24576": { + "BLOCK_N": 128, + "SPLIT_N": 64, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..5cf06550f0b1 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=16,hidden_size=256": { + "BLOCK_N": 32, + "SPLIT_N": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..5ee401212495 --- /dev/null +++ 
b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=16,hidden_size=2560": { + "BLOCK_N": 32, + "SPLIT_N": 32, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..1d7db0c6a860 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=16,hidden_size=27392": { + "BLOCK_N": 512, + "SPLIT_N": 32, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..77fc2358208c --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=16,hidden_size=2752": { + "BLOCK_N": 32, + "SPLIT_N": 32, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..3b33817c6ecb --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=16,hidden_size=27648": { + "BLOCK_N": 256, + "SPLIT_N": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..d62a622342b5 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=16,hidden_size=3072": { + "BLOCK_N": 32, + "SPLIT_N": 32, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..c7030ad5a673 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=16,hidden_size=32000": { + "BLOCK_N": 64, + "SPLIT_N": 16, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..ff76f3c110b9 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=16,hidden_size=32256": { + "BLOCK_N": 512, + "SPLIT_N": 16, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json 
b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..721b587a948d --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=16,hidden_size=32512": { + "BLOCK_N": 256, + "SPLIT_N": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..92932b62f1a1 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=16,hidden_size=32768": { + "BLOCK_N": 256, + "SPLIT_N": 32, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..7cf1394d96bd --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=16,hidden_size=33024": { + "BLOCK_N": 128, + "SPLIT_N": 16, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..bec43f2e9cd1 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=16,hidden_size=3328": { + "BLOCK_N": 32, + "SPLIT_N": 64, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..8dfd12024faa --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=16,hidden_size=3456": { + "BLOCK_N": 32, + "SPLIT_N": 32, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..c9fa0757f4d2 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=16,hidden_size=3584": { + "BLOCK_N": 32, + "SPLIT_N": 32, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..2323a50dfb84 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=16,hidden_size=36864": { + 
"BLOCK_N": 128, + "SPLIT_N": 32, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..41e170807720 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=16,hidden_size=4096": { + "BLOCK_N": 64, + "SPLIT_N": 64, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..b04da877902c --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=16,hidden_size=43264": { + "BLOCK_N": 128, + "SPLIT_N": 32, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..0df3ef025f97 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=16,hidden_size=4608": { + "BLOCK_N": 128, + "SPLIT_N": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..07e41596ed86 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=16,hidden_size=49152": { + "BLOCK_N": 128, + "SPLIT_N": 32, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..9013302be01a --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=16,hidden_size=512": { + "BLOCK_N": 32, + "SPLIT_N": 16, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..520a85f2e70a --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=16,hidden_size=5120": { + "BLOCK_N": 256, + "SPLIT_N": 16, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..11a3940a9d4a --- /dev/null +++ 
b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=16,hidden_size=5504": { + "BLOCK_N": 64, + "SPLIT_N": 128, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..43f4baa91a71 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=16,hidden_size=5632": { + "BLOCK_N": 32, + "SPLIT_N": 64, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..2a5260ec1d4d --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=16,hidden_size=6144": { + "BLOCK_N": 32, + "SPLIT_N": 128, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..962399539ec2 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=16,hidden_size=6400": { + "BLOCK_N": 128, + "SPLIT_N": 32, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..cbc8f93ce329 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=16,hidden_size=64000": { + "BLOCK_N": 64, + "SPLIT_N": 32, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..95e76f479321 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=16,hidden_size=64256": { + "BLOCK_N": 512, + "SPLIT_N": 16, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..b967d91645ed --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=16,hidden_size=6848": { + "BLOCK_N": 128, + "SPLIT_N": 32, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json 
b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..6c2f971176df --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=16,hidden_size=6912": { + "BLOCK_N": 32, + "SPLIT_N": 256, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..e0203c01009e --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=16,hidden_size=7168": { + "BLOCK_N": 64, + "SPLIT_N": 64, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..0db797564e0d --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=16,hidden_size=8192": { + "BLOCK_N": 32, + "SPLIT_N": 128, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..ab2faa8a3e47 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=16,hidden_size=9216": { + "BLOCK_N": 32, + "SPLIT_N": 16, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..99d36f108d24 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=2,hidden_size=1024": { + "BLOCK_N": 32, + "SPLIT_N": 32, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..5d1797c7df6a --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=2,hidden_size=10240": { + "BLOCK_N": 64, + "SPLIT_N": 256, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..3c5a379e0bdc --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=2,hidden_size=102400": { + "BLOCK_N": 32, + 
"SPLIT_N": 256, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..16f2497bed72 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=2,hidden_size=102656": { + "BLOCK_N": 64, + "SPLIT_N": 128, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..de9477263adf --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=2,hidden_size=11008": { + "BLOCK_N": 64, + "SPLIT_N": 256, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..58b67d1eb450 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=2,hidden_size=1152": { + "BLOCK_N": 64, + "SPLIT_N": 32, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..3b20b74c6b65 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=2,hidden_size=128": { + "BLOCK_N": 32, + "SPLIT_N": 16, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..711133cabf41 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=2,hidden_size=1280": { + "BLOCK_N": 64, + "SPLIT_N": 32, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..8ae36e752fa0 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=2,hidden_size=128000": { + "BLOCK_N": 1024, + "SPLIT_N": 128, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..9bd2e640b741 --- /dev/null +++ 
b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=2,hidden_size=128256": { + "BLOCK_N": 64, + "SPLIT_N": 128, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..e9ef08e28930 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=2,hidden_size=13824": { + "BLOCK_N": 64, + "SPLIT_N": 256, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..02df6114edd3 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=2,hidden_size=14336": { + "BLOCK_N": 64, + "SPLIT_N": 256, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..f174cccf6781 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=2,hidden_size=1536": { + "BLOCK_N": 64, + "SPLIT_N": 32, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..95398b063d5a --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=2,hidden_size=15360": { + "BLOCK_N": 32, + "SPLIT_N": 256, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..a9eb1222067d --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=2,hidden_size=2048": { + "BLOCK_N": 64, + "SPLIT_N": 32, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..b0376ebedcf0 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=2,hidden_size=22016": { + "BLOCK_N": 32, + "SPLIT_N": 256, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json 
b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..33a78ee55501 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=2,hidden_size=2304": { + "BLOCK_N": 32, + "SPLIT_N": 128, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..5d62f3bc77ad --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=2,hidden_size=24576": { + "BLOCK_N": 32, + "SPLIT_N": 256, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..bb04a656ec58 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=2,hidden_size=256": { + "BLOCK_N": 32, + "SPLIT_N": 16, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..76589c2e4848 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=2,hidden_size=2560": { + "BLOCK_N": 32, + "SPLIT_N": 128, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..c4f0e1fe7d02 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=2,hidden_size=27392": { + "BLOCK_N": 32, + "SPLIT_N": 128, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..4e3e51669b1a --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=2,hidden_size=2752": { + "BLOCK_N": 64, + "SPLIT_N": 64, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..76507320d8c9 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=2,hidden_size=27648": { + "BLOCK_N": 64, + "SPLIT_N": 64, + 
"num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..95aefbac204c --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=2,hidden_size=3072": { + "BLOCK_N": 32, + "SPLIT_N": 128, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..804c9a7df946 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=2,hidden_size=32000": { + "BLOCK_N": 512, + "SPLIT_N": 64, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..8c4069b4c997 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=2,hidden_size=32256": { + "BLOCK_N": 64, + "SPLIT_N": 32, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..0de6a013f40e --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=2,hidden_size=32512": { + "BLOCK_N": 32, + "SPLIT_N": 256, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..fbc4f954e962 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=2,hidden_size=32768": { + "BLOCK_N": 128, + "SPLIT_N": 32, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..89dff508fcff --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=2,hidden_size=33024": { + "BLOCK_N": 32, + "SPLIT_N": 256, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..fb48ebaa8f79 --- /dev/null +++ 
b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=2,hidden_size=3328": { + "BLOCK_N": 32, + "SPLIT_N": 128, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..7603143e1ff2 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=2,hidden_size=3456": { + "BLOCK_N": 32, + "SPLIT_N": 128, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..f119d16aa81c --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=2,hidden_size=3584": { + "BLOCK_N": 32, + "SPLIT_N": 128, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..b19e8f096df7 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=2,hidden_size=36864": { + "BLOCK_N": 32, + "SPLIT_N": 128, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..e870cd0967a0 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=2,hidden_size=4096": { + "BLOCK_N": 32, + "SPLIT_N": 128, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..b30bd7d45e40 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=2,hidden_size=43264": { + "BLOCK_N": 64, + "SPLIT_N": 256, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..29ab8038c085 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=2,hidden_size=4608": { + "BLOCK_N": 32, + "SPLIT_N": 256, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json 
b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..39faabbdede5 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=2,hidden_size=49152": { + "BLOCK_N": 128, + "SPLIT_N": 256, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..87cad481bbef --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=2,hidden_size=512": { + "BLOCK_N": 32, + "SPLIT_N": 16, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..ec7d7b1e6d69 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=2,hidden_size=5120": { + "BLOCK_N": 32, + "SPLIT_N": 256, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..779014c6a48d --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=2,hidden_size=5504": { + "BLOCK_N": 64, + "SPLIT_N": 128, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..e4e1610914d4 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=2,hidden_size=5632": { + "BLOCK_N": 32, + "SPLIT_N": 256, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..91e760ed29e7 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=2,hidden_size=6144": { + "BLOCK_N": 32, + "SPLIT_N": 256, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..d1dba65beeb5 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=2,hidden_size=6400": { + "BLOCK_N": 32, + "SPLIT_N": 256, + 
"num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..63cd02986d0e --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=2,hidden_size=64000": { + "BLOCK_N": 128, + "SPLIT_N": 128, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..53bd1e2a033d --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=2,hidden_size=64256": { + "BLOCK_N": 256, + "SPLIT_N": 256, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..ceaea00d1ad4 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=2,hidden_size=6848": { + "BLOCK_N": 128, + "SPLIT_N": 64, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..f7fe5d732ec5 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=2,hidden_size=6912": { + "BLOCK_N": 32, + "SPLIT_N": 256, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..d026e12311a7 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=2,hidden_size=7168": { + "BLOCK_N": 32, + "SPLIT_N": 256, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..1ce097bb563d --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=2,hidden_size=8192": { + "BLOCK_N": 32, + "SPLIT_N": 256, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..f22a1513a6a4 --- /dev/null +++ 
b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=2,hidden_size=9216": { + "BLOCK_N": 64, + "SPLIT_N": 256, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..dc055ce5a023 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=24,hidden_size=1024": { + "BLOCK_N": 32, + "SPLIT_N": 16, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..fec5aa8a43a5 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=24,hidden_size=10240": { + "BLOCK_N": 512, + "SPLIT_N": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..f76e21dcf101 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=24,hidden_size=102400": { + "BLOCK_N": 256, + "SPLIT_N": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..02e01a88f229 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=24,hidden_size=102656": { + "BLOCK_N": 128, + "SPLIT_N": 16, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..ca13ab17631b --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=24,hidden_size=11008": { + "BLOCK_N": 128, + "SPLIT_N": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..221b7046a42d --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=24,hidden_size=1152": { + "BLOCK_N": 32, + "SPLIT_N": 16, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json 
b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..04e2fee606e4 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=24,hidden_size=128": { + "BLOCK_N": 64, + "SPLIT_N": 2, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..f3e8e4c95080 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=24,hidden_size=1280": { + "BLOCK_N": 32, + "SPLIT_N": 32, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..92d8dc48ef21 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=24,hidden_size=128000": { + "BLOCK_N": 512, + "SPLIT_N": 16, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..deddd170e828 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=24,hidden_size=128256": { + "BLOCK_N": 256, + "SPLIT_N": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..d8dba1076582 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=24,hidden_size=13824": { + "BLOCK_N": 256, + "SPLIT_N": 16, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..585c1d33ce0d --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=24,hidden_size=14336": { + "BLOCK_N": 128, + "SPLIT_N": 16, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..3a2668c5d3fd --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=24,hidden_size=1536": { + 
"BLOCK_N": 32, + "SPLIT_N": 16, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..ac9189fdf242 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=24,hidden_size=15360": { + "BLOCK_N": 64, + "SPLIT_N": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..9cc54aa3ceae --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=24,hidden_size=2048": { + "BLOCK_N": 32, + "SPLIT_N": 32, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..a49ce276482b --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=24,hidden_size=22016": { + "BLOCK_N": 128, + "SPLIT_N": 16, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..b66638610a4a --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=24,hidden_size=2304": { + "BLOCK_N": 32, + "SPLIT_N": 16, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..6a6f0780ee68 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=24,hidden_size=24576": { + "BLOCK_N": 256, + "SPLIT_N": 16, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..64cdc015d4f5 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=24,hidden_size=256": { + "BLOCK_N": 32, + "SPLIT_N": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..d768b5944d32 --- /dev/null +++ 
b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=24,hidden_size=2560": { + "BLOCK_N": 32, + "SPLIT_N": 32, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..e1af0bc7d4a2 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=24,hidden_size=27392": { + "BLOCK_N": 256, + "SPLIT_N": 16, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..de80b48b9e46 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=24,hidden_size=2752": { + "BLOCK_N": 32, + "SPLIT_N": 16, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..2f1cc53d9d07 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=24,hidden_size=27648": { + "BLOCK_N": 64, + "SPLIT_N": 32, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..5399d322bca2 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=24,hidden_size=3072": { + "BLOCK_N": 32, + "SPLIT_N": 64, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..ebe63936e73e --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=24,hidden_size=32000": { + "BLOCK_N": 128, + "SPLIT_N": 16, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..d0af75521d17 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=24,hidden_size=32256": { + "BLOCK_N": 256, + "SPLIT_N": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json 
b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..ecb601665b16 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=24,hidden_size=32512": { + "BLOCK_N": 256, + "SPLIT_N": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..0a95531a6226 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=24,hidden_size=32768": { + "BLOCK_N": 32, + "SPLIT_N": 32, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..e927a860d646 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=24,hidden_size=33024": { + "BLOCK_N": 64, + "SPLIT_N": 16, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..64757e015c63 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=24,hidden_size=3328": { + "BLOCK_N": 64, + "SPLIT_N": 64, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..de0faf408e1f --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=24,hidden_size=3456": { + "BLOCK_N": 32, + "SPLIT_N": 32, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..5c93deb397ab --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=24,hidden_size=3584": { + "BLOCK_N": 256, + "SPLIT_N": 16, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..3ce9a9150319 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=24,hidden_size=36864": { + 
"BLOCK_N": 64, + "SPLIT_N": 32, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..82f6e893c6ad --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=24,hidden_size=4096": { + "BLOCK_N": 128, + "SPLIT_N": 64, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..e21f857efe75 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=24,hidden_size=43264": { + "BLOCK_N": 128, + "SPLIT_N": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..b41625baab6a --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=24,hidden_size=4608": { + "BLOCK_N": 32, + "SPLIT_N": 64, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..9ce03febee07 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=24,hidden_size=49152": { + "BLOCK_N": 64, + "SPLIT_N": 16, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..efecf1f371f9 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=24,hidden_size=512": { + "BLOCK_N": 32, + "SPLIT_N": 16, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..b1a5db564eba --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=24,hidden_size=5120": { + "BLOCK_N": 128, + "SPLIT_N": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..d105bc53b555 --- /dev/null +++ 
b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=24,hidden_size=5504": { + "BLOCK_N": 32, + "SPLIT_N": 64, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..e042d730fde0 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=24,hidden_size=5632": { + "BLOCK_N": 256, + "SPLIT_N": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..005e8480d530 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=24,hidden_size=6144": { + "BLOCK_N": 256, + "SPLIT_N": 16, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..b0ce387d16a7 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=24,hidden_size=6400": { + "BLOCK_N": 256, + "SPLIT_N": 16, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..fe8d2347f8be --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=24,hidden_size=64000": { + "BLOCK_N": 64, + "SPLIT_N": 32, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..19016d92afc8 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=24,hidden_size=64256": { + "BLOCK_N": 1024, + "SPLIT_N": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..c24e63d4b061 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=24,hidden_size=6848": { + "BLOCK_N": 32, + "SPLIT_N": 16, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json 
b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..621e45f3d647 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=24,hidden_size=6912": { + "BLOCK_N": 64, + "SPLIT_N": 64, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..1764f42a3690 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=24,hidden_size=7168": { + "BLOCK_N": 256, + "SPLIT_N": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..2c005c34c3b5 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=24,hidden_size=8192": { + "BLOCK_N": 128, + "SPLIT_N": 64, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..7a560bf5b977 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=24,hidden_size=9216": { + "BLOCK_N": 128, + "SPLIT_N": 16, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..0fb0abbfd93b --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=32,hidden_size=1024": { + "BLOCK_N": 32, + "SPLIT_N": 16, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..5089debbfefd --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=32,hidden_size=10240": { + "BLOCK_N": 256, + "SPLIT_N": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..879a1d36c817 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=32,hidden_size=102400": { + 
"BLOCK_N": 32, + "SPLIT_N": 16, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..eda15de61763 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=32,hidden_size=102656": { + "BLOCK_N": 64, + "SPLIT_N": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..10b2451c5f88 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=32,hidden_size=11008": { + "BLOCK_N": 64, + "SPLIT_N": 16, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..3ef065e97426 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=32,hidden_size=1152": { + "BLOCK_N": 32, + "SPLIT_N": 16, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..26ecc87645eb --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=32,hidden_size=128": { + "BLOCK_N": 32, + "SPLIT_N": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..cea687cffe03 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=32,hidden_size=1280": { + "BLOCK_N": 32, + "SPLIT_N": 16, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..50b07329f967 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=32,hidden_size=128000": { + "BLOCK_N": 128, + "SPLIT_N": 16, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..c9bc76d4e02d --- 
/dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=32,hidden_size=128256": { + "BLOCK_N": 128, + "SPLIT_N": 16, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..5e2bd08a4728 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=32,hidden_size=13824": { + "BLOCK_N": 64, + "SPLIT_N": 16, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..95c19b2bafe1 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=32,hidden_size=14336": { + "BLOCK_N": 64, + "SPLIT_N": 16, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..2a3648ab54f4 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=32,hidden_size=1536": { + "BLOCK_N": 32, + "SPLIT_N": 16, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..d384ecbf3556 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=32,hidden_size=15360": { + "BLOCK_N": 256, + "SPLIT_N": 16, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..017eb531ae10 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=32,hidden_size=2048": { + "BLOCK_N": 32, + "SPLIT_N": 16, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..841202546c13 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=32,hidden_size=22016": { + "BLOCK_N": 256, + "SPLIT_N": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=2304 
device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..78e4c0323585 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=32,hidden_size=2304": { + "BLOCK_N": 128, + "SPLIT_N": 32, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..1ac24aae29fe --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=32,hidden_size=24576": { + "BLOCK_N": 64, + "SPLIT_N": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..8422904ffb26 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=32,hidden_size=256": { + "BLOCK_N": 32, + "SPLIT_N": 16, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..8a0eb3dd15c3 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=32,hidden_size=2560": { + "BLOCK_N": 32, + "SPLIT_N": 128, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..0e433e6bed3d --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=32,hidden_size=27392": { + "BLOCK_N": 64, + "SPLIT_N": 16, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..d6adf2ceb6b6 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=32,hidden_size=2752": { + "BLOCK_N": 64, + "SPLIT_N": 64, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..bca14db36270 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + 
"batchs=32,hidden_size=27648": { + "BLOCK_N": 64, + "SPLIT_N": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..985d62438445 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=32,hidden_size=3072": { + "BLOCK_N": 64, + "SPLIT_N": 32, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..e8464e7008ce --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=32,hidden_size=32000": { + "BLOCK_N": 128, + "SPLIT_N": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..a47ee1a7459e --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=32,hidden_size=32256": { + "BLOCK_N": 1024, + "SPLIT_N": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..889bbdc9e7c8 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=32,hidden_size=32512": { + "BLOCK_N": 256, + "SPLIT_N": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..7f722c7cc4f2 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=32,hidden_size=32768": { + "BLOCK_N": 512, + "SPLIT_N": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..ee0493b17a74 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=32,hidden_size=33024": { + "BLOCK_N": 32, + "SPLIT_N": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 
000000000000..552f181d0e3a --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=32,hidden_size=3328": { + "BLOCK_N": 32, + "SPLIT_N": 32, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..d2fb833b52b7 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=32,hidden_size=3456": { + "BLOCK_N": 256, + "SPLIT_N": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..6fd72f2cb7df --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=32,hidden_size=3584": { + "BLOCK_N": 32, + "SPLIT_N": 32, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..b566409ba7bc --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=32,hidden_size=36864": { + "BLOCK_N": 512, + "SPLIT_N": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..02781cc3c3a3 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=32,hidden_size=4096": { + "BLOCK_N": 128, + "SPLIT_N": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..5eb93d119fd2 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=32,hidden_size=43264": { + "BLOCK_N": 64, + "SPLIT_N": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..691bf7cf400d --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=32,hidden_size=4608": { + "BLOCK_N": 32, + "SPLIT_N": 16, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=49152 
device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..e6f417f1ce35 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=32,hidden_size=49152": { + "BLOCK_N": 64, + "SPLIT_N": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..ecc4311c65c6 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=32,hidden_size=512": { + "BLOCK_N": 32, + "SPLIT_N": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..2f8e3bad9359 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=32,hidden_size=5120": { + "BLOCK_N": 256, + "SPLIT_N": 16, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..28bb43c5cea0 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=32,hidden_size=5504": { + "BLOCK_N": 128, + "SPLIT_N": 16, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..ffd213467c8b --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=32,hidden_size=5632": { + "BLOCK_N": 256, + "SPLIT_N": 16, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..43dd6d5c4f34 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=32,hidden_size=6144": { + "BLOCK_N": 128, + "SPLIT_N": 16, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..a73329612467 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + 
"batchs=32,hidden_size=6400": { + "BLOCK_N": 128, + "SPLIT_N": 16, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..1537688252ba --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=32,hidden_size=64000": { + "BLOCK_N": 512, + "SPLIT_N": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..bdd7de7b0544 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=32,hidden_size=64256": { + "BLOCK_N": 512, + "SPLIT_N": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..537e8a289957 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=32,hidden_size=6848": { + "BLOCK_N": 128, + "SPLIT_N": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..5822a67015d7 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=32,hidden_size=6912": { + "BLOCK_N": 256, + "SPLIT_N": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..091e7c378078 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=32,hidden_size=7168": { + "BLOCK_N": 64, + "SPLIT_N": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..d0c8d9d533a7 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=32,hidden_size=8192": { + "BLOCK_N": 128, + "SPLIT_N": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 
000000000000..9b047851381d --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=32,hidden_size=9216": { + "BLOCK_N": 64, + "SPLIT_N": 16, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..ba9a5daa8327 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=4,hidden_size=1024": { + "BLOCK_N": 32, + "SPLIT_N": 32, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..49bcf2f569f7 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=4,hidden_size=10240": { + "BLOCK_N": 32, + "SPLIT_N": 128, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..6285daf17f71 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=4,hidden_size=102400": { + "BLOCK_N": 64, + "SPLIT_N": 64, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..bd9ce93f682f --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=4,hidden_size=102656": { + "BLOCK_N": 64, + "SPLIT_N": 32, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..939967371660 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=4,hidden_size=11008": { + "BLOCK_N": 32, + "SPLIT_N": 128, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..798594bfd3a5 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=4,hidden_size=1152": { + "BLOCK_N": 64, + "SPLIT_N": 32, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=128 
device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..3fa0c0edbdfe --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=4,hidden_size=128": { + "BLOCK_N": 32, + "SPLIT_N": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..531e3e4accaa --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=4,hidden_size=1280": { + "BLOCK_N": 32, + "SPLIT_N": 64, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..9d597ca3ab45 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=4,hidden_size=128000": { + "BLOCK_N": 128, + "SPLIT_N": 64, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..7826d6de2043 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=4,hidden_size=128256": { + "BLOCK_N": 128, + "SPLIT_N": 128, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..4382dfac1232 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=4,hidden_size=13824": { + "BLOCK_N": 32, + "SPLIT_N": 128, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..0dc25b7a9c47 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=4,hidden_size=14336": { + "BLOCK_N": 32, + "SPLIT_N": 256, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..2cb628ac30d9 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + 
"batchs=4,hidden_size=1536": { + "BLOCK_N": 32, + "SPLIT_N": 64, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..d3ade6322fcf --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=4,hidden_size=15360": { + "BLOCK_N": 256, + "SPLIT_N": 32, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..f743a190ff6c --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=4,hidden_size=2048": { + "BLOCK_N": 32, + "SPLIT_N": 64, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..13dc549b58a4 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=4,hidden_size=22016": { + "BLOCK_N": 128, + "SPLIT_N": 64, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..dd31e03333ef --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=4,hidden_size=2304": { + "BLOCK_N": 64, + "SPLIT_N": 64, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..f873a2168d70 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=4,hidden_size=24576": { + "BLOCK_N": 32, + "SPLIT_N": 256, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..f654b1763c6b --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=4,hidden_size=256": { + "BLOCK_N": 32, + "SPLIT_N": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..c3d6c38da9ce --- 
/dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=4,hidden_size=2560": { + "BLOCK_N": 64, + "SPLIT_N": 64, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..6ebeee44d74f --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=4,hidden_size=27392": { + "BLOCK_N": 64, + "SPLIT_N": 256, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..4583c1bd2a74 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=4,hidden_size=2752": { + "BLOCK_N": 64, + "SPLIT_N": 64, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..457cba2bb27a --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=4,hidden_size=27648": { + "BLOCK_N": 128, + "SPLIT_N": 128, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..6c60fd9cf325 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=4,hidden_size=3072": { + "BLOCK_N": 64, + "SPLIT_N": 64, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..12dff65ef5e3 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=4,hidden_size=32000": { + "BLOCK_N": 32, + "SPLIT_N": 256, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..7e6bcdc82b12 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=4,hidden_size=32256": { + "BLOCK_N": 64, + "SPLIT_N": 256, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json 
b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..5cef4c0639e2 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=4,hidden_size=32512": { + "BLOCK_N": 32, + "SPLIT_N": 128, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..d475b36f7b10 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=4,hidden_size=32768": { + "BLOCK_N": 64, + "SPLIT_N": 64, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..62bd24b55325 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=4,hidden_size=33024": { + "BLOCK_N": 128, + "SPLIT_N": 64, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..b9d49f65f25c --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=4,hidden_size=3328": { + "BLOCK_N": 64, + "SPLIT_N": 64, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..f13c7ea9fcdb --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=4,hidden_size=3456": { + "BLOCK_N": 64, + "SPLIT_N": 64, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..3c8bb10faf54 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=4,hidden_size=3584": { + "BLOCK_N": 32, + "SPLIT_N": 64, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..eeeed1d55f4c --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=4,hidden_size=36864": { + "BLOCK_N": 128, + "SPLIT_N": 256, 
+ "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..c1780da9065d --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=4,hidden_size=4096": { + "BLOCK_N": 32, + "SPLIT_N": 64, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..73265ea43e99 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=4,hidden_size=43264": { + "BLOCK_N": 128, + "SPLIT_N": 64, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..95ef324ce999 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=4,hidden_size=4608": { + "BLOCK_N": 64, + "SPLIT_N": 128, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..a2861173e71e --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=4,hidden_size=49152": { + "BLOCK_N": 256, + "SPLIT_N": 256, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..03789328aa67 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=4,hidden_size=512": { + "BLOCK_N": 32, + "SPLIT_N": 16, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..2ecae2ab22ab --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=4,hidden_size=5120": { + "BLOCK_N": 64, + "SPLIT_N": 128, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..7979c4049101 --- /dev/null +++ 
b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=4,hidden_size=5504": { + "BLOCK_N": 64, + "SPLIT_N": 128, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..e11fdff5cf8b --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=4,hidden_size=5632": { + "BLOCK_N": 32, + "SPLIT_N": 64, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..b5d4291484ac --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=4,hidden_size=6144": { + "BLOCK_N": 32, + "SPLIT_N": 256, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..510e5bcdd8f9 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=4,hidden_size=6400": { + "BLOCK_N": 64, + "SPLIT_N": 128, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..a7d9938f211d --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=4,hidden_size=64000": { + "BLOCK_N": 256, + "SPLIT_N": 32, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..4b4d5715c4a6 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=4,hidden_size=64256": { + "BLOCK_N": 128, + "SPLIT_N": 128, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..bdc940e4306d --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=4,hidden_size=6848": { + "BLOCK_N": 64, + "SPLIT_N": 64, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json 
b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..6ac8e567768d --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=4,hidden_size=6912": { + "BLOCK_N": 32, + "SPLIT_N": 128, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..a5a8025c74e2 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=4,hidden_size=7168": { + "BLOCK_N": 64, + "SPLIT_N": 64, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..4937a00c96b9 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=4,hidden_size=8192": { + "BLOCK_N": 128, + "SPLIT_N": 64, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..14d37e0e84da --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=4,hidden_size=9216": { + "BLOCK_N": 32, + "SPLIT_N": 128, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..0655aeaf04d4 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=8,hidden_size=1024": { + "BLOCK_N": 32, + "SPLIT_N": 32, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..4da43af96a88 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=8,hidden_size=10240": { + "BLOCK_N": 64, + "SPLIT_N": 128, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..02944f10112d --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=8,hidden_size=102400": { + "BLOCK_N": 64, + "SPLIT_N": 128, + 
"num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..888cbee83cd0 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=8,hidden_size=102656": { + "BLOCK_N": 32, + "SPLIT_N": 64, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..9b48040ae35d --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=8,hidden_size=11008": { + "BLOCK_N": 128, + "SPLIT_N": 128, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..17628098a876 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=8,hidden_size=1152": { + "BLOCK_N": 64, + "SPLIT_N": 32, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..ba77dfd4e745 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=8,hidden_size=128": { + "BLOCK_N": 32, + "SPLIT_N": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..afc038f82824 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=8,hidden_size=1280": { + "BLOCK_N": 64, + "SPLIT_N": 32, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..cfca3795cf0c --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=8,hidden_size=128000": { + "BLOCK_N": 512, + "SPLIT_N": 64, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..40205831e8c5 --- /dev/null +++ 
b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=8,hidden_size=128256": { + "BLOCK_N": 128, + "SPLIT_N": 32, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..73480c2a2fb0 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=8,hidden_size=13824": { + "BLOCK_N": 64, + "SPLIT_N": 32, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..0ba68ff88dd7 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=8,hidden_size=14336": { + "BLOCK_N": 32, + "SPLIT_N": 64, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..1411944ed903 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=8,hidden_size=1536": { + "BLOCK_N": 64, + "SPLIT_N": 32, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..52d4d6d866da --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=8,hidden_size=15360": { + "BLOCK_N": 1024, + "SPLIT_N": 16, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..5e8c8b03d807 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=8,hidden_size=2048": { + "BLOCK_N": 32, + "SPLIT_N": 32, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..ea2e2b703621 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=8,hidden_size=22016": { + "BLOCK_N": 128, + "SPLIT_N": 16, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json 
b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..e07a33a9f890 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=8,hidden_size=2304": { + "BLOCK_N": 64, + "SPLIT_N": 64, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..1cd26dfc178c --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=8,hidden_size=24576": { + "BLOCK_N": 128, + "SPLIT_N": 128, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..2101c81521e4 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=8,hidden_size=256": { + "BLOCK_N": 32, + "SPLIT_N": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..c6545c96c672 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=8,hidden_size=2560": { + "BLOCK_N": 32, + "SPLIT_N": 32, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..dee776ee0b6b --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=8,hidden_size=27392": { + "BLOCK_N": 256, + "SPLIT_N": 32, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..b34f648169cf --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=8,hidden_size=2752": { + "BLOCK_N": 32, + "SPLIT_N": 32, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..b90111549674 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=8,hidden_size=27648": { + "BLOCK_N": 128, + "SPLIT_N": 16, + 
"num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..a07aa0c23f3e --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=8,hidden_size=3072": { + "BLOCK_N": 32, + "SPLIT_N": 32, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..70c8889ecf4f --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=8,hidden_size=32000": { + "BLOCK_N": 64, + "SPLIT_N": 32, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..9050b5588db6 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=8,hidden_size=32256": { + "BLOCK_N": 128, + "SPLIT_N": 128, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..64a0a899f92c --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=8,hidden_size=32512": { + "BLOCK_N": 256, + "SPLIT_N": 16, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..3fc62e336640 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=8,hidden_size=32768": { + "BLOCK_N": 128, + "SPLIT_N": 32, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..8bf15baf2d0e --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=8,hidden_size=33024": { + "BLOCK_N": 128, + "SPLIT_N": 16, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..3b72a219f413 --- /dev/null +++ 
b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=8,hidden_size=3328": { + "BLOCK_N": 32, + "SPLIT_N": 64, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..555f04b52080 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=8,hidden_size=3456": { + "BLOCK_N": 128, + "SPLIT_N": 32, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..f5e814a48de6 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=8,hidden_size=3584": { + "BLOCK_N": 128, + "SPLIT_N": 32, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..aed09238da26 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=8,hidden_size=36864": { + "BLOCK_N": 32, + "SPLIT_N": 64, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..e32fcb1ffb63 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=8,hidden_size=4096": { + "BLOCK_N": 128, + "SPLIT_N": 32, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..120a7ac01e61 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=8,hidden_size=43264": { + "BLOCK_N": 256, + "SPLIT_N": 128, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..3580fa47a62f --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=8,hidden_size=4608": { + "BLOCK_N": 32, + "SPLIT_N": 64, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json 
b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..5b7de710ac80 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=8,hidden_size=49152": { + "BLOCK_N": 256, + "SPLIT_N": 256, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..c25b01a3a2ee --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=8,hidden_size=512": { + "BLOCK_N": 32, + "SPLIT_N": 16, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..39e20cdd3dc4 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=8,hidden_size=5120": { + "BLOCK_N": 32, + "SPLIT_N": 32, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..6103c0f82883 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=8,hidden_size=5504": { + "BLOCK_N": 128, + "SPLIT_N": 64, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..534348364229 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=8,hidden_size=5632": { + "BLOCK_N": 64, + "SPLIT_N": 32, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..29cdb7e2b43b --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=8,hidden_size=6144": { + "BLOCK_N": 128, + "SPLIT_N": 32, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..e5362a24e683 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=8,hidden_size=6400": { + "BLOCK_N": 32, + "SPLIT_N": 256, + "num_warps": 
4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..6f5a25fef3b2 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=8,hidden_size=64000": { + "BLOCK_N": 256, + "SPLIT_N": 32, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..5cd46fba6793 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=8,hidden_size=64256": { + "BLOCK_N": 64, + "SPLIT_N": 256, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..7afa26abf9ed --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=8,hidden_size=6848": { + "BLOCK_N": 128, + "SPLIT_N": 64, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..c2fbb625f0cb --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=8,hidden_size=6912": { + "BLOCK_N": 64, + "SPLIT_N": 128, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..182c287a971c --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=8,hidden_size=7168": { + "BLOCK_N": 64, + "SPLIT_N": 128, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..543cfc5cf252 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=8,hidden_size=8192": { + "BLOCK_N": 128, + "SPLIT_N": 64, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..209fd07020ce --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=9216 
device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=8,hidden_size=9216": { + "BLOCK_N": 32, + "SPLIT_N": 128, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..e7807642e242 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=1,hidden_size=1024": { + "BLOCK_K": 32, + "SPLIT_K": 32, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..d990a464aead --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=1,hidden_size=10240": { + "BLOCK_K": 128, + "SPLIT_K": 128, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..f8ba6f98e11d --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=1,hidden_size=102400": { + "BLOCK_K": 64, + "SPLIT_K": 128, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..2820c3819fb1 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=1,hidden_size=102656": { + "BLOCK_K": 64, + "SPLIT_K": 128, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..8ff38bafd95b --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=1,hidden_size=11008": { + "BLOCK_K": 128, + "SPLIT_K": 128, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..2239222c6721 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=1,hidden_size=1152": { + "BLOCK_K": 32, + "SPLIT_K": 64, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=128 
device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..76ebd9a6187b --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=1,hidden_size=128": { + "BLOCK_K": 32, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..604d0f4a24f6 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=1,hidden_size=1280": { + "BLOCK_K": 32, + "SPLIT_K": 64, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..7b9522e223c9 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=1,hidden_size=128000": { + "BLOCK_K": 128, + "SPLIT_K": 128, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..16ca324abb1d --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=1,hidden_size=128256": { + "BLOCK_K": 128, + "SPLIT_K": 128, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..51fc783ba5be --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=1,hidden_size=13824": { + "BLOCK_K": 128, + "SPLIT_K": 128, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..2b5c9fb008a5 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=1,hidden_size=14336": { + "BLOCK_K": 128, + "SPLIT_K": 128, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..1d6fdcc9e4cf --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=1,hidden_size=1536": { + "BLOCK_K": 32, + "SPLIT_K": 64, + "num_warps": 4 + } +} diff --git 
a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..0188952eabb5 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=1,hidden_size=15360": { + "BLOCK_K": 128, + "SPLIT_K": 64, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..748317facd1a --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=1,hidden_size=2048": { + "BLOCK_K": 64, + "SPLIT_K": 32, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..762c6902553f --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=1,hidden_size=22016": { + "BLOCK_K": 64, + "SPLIT_K": 128, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..50693dd027e9 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=1,hidden_size=2304": { + "BLOCK_K": 64, + "SPLIT_K": 64, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..ce7f797b6501 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=1,hidden_size=24576": { + "BLOCK_K": 64, + "SPLIT_K": 128, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..1fb7817d045a --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=1,hidden_size=256": { + "BLOCK_K": 32, + "SPLIT_K": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..93cb002287b9 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=2560 
device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=1,hidden_size=2560": { + "BLOCK_K": 64, + "SPLIT_K": 64, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..df245f7e4e3a --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=1,hidden_size=27392": { + "BLOCK_K": 64, + "SPLIT_K": 128, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..c420bc28686a --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=1,hidden_size=2752": { + "BLOCK_K": 64, + "SPLIT_K": 64, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..ec8253156d69 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=1,hidden_size=27648": { + "BLOCK_K": 128, + "SPLIT_K": 128, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..8eb952a2f70b --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=1,hidden_size=3072": { + "BLOCK_K": 64, + "SPLIT_K": 64, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..5f4e26b927c4 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=1,hidden_size=32000": { + "BLOCK_K": 128, + "SPLIT_K": 128, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..6567393424e9 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=1,hidden_size=32256": { + "BLOCK_K": 128, + "SPLIT_K": 128, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=32512 
device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..0de485aa91ed --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=1,hidden_size=32512": { + "BLOCK_K": 64, + "SPLIT_K": 128, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..382626229f94 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=1,hidden_size=32768": { + "BLOCK_K": 128, + "SPLIT_K": 128, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..f339e51540b0 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=1,hidden_size=33024": { + "BLOCK_K": 64, + "SPLIT_K": 128, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..1c29335eaf49 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=1,hidden_size=3328": { + "BLOCK_K": 64, + "SPLIT_K": 64, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..164746c525a9 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=1,hidden_size=3456": { + "BLOCK_K": 64, + "SPLIT_K": 64, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..32f4a2527a5f --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=1,hidden_size=3584": { + "BLOCK_K": 64, + "SPLIT_K": 64, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..bf9fccfd9628 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=1,hidden_size=36864": { + "BLOCK_K": 64, + "SPLIT_K": 128, + "num_warps": 8 + } +} diff --git 
a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..5afaf9370a75 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=1,hidden_size=4096": { + "BLOCK_K": 64, + "SPLIT_K": 64, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..434e65d6010f --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=1,hidden_size=43264": { + "BLOCK_K": 64, + "SPLIT_K": 128, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..5b05b7d563cb --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=1,hidden_size=4608": { + "BLOCK_K": 128, + "SPLIT_K": 64, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..6ed84d41e515 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=1,hidden_size=49152": { + "BLOCK_K": 128, + "SPLIT_K": 128, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..57f7208d5b31 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=1,hidden_size=512": { + "BLOCK_K": 32, + "SPLIT_K": 32, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..a39eee775813 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=1,hidden_size=5120": { + "BLOCK_K": 128, + "SPLIT_K": 64, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..386dcefdd9ce --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=5504 
device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=1,hidden_size=5504": { + "BLOCK_K": 128, + "SPLIT_K": 64, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..404b2e3e7143 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=1,hidden_size=5632": { + "BLOCK_K": 128, + "SPLIT_K": 64, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..f0f167c7f637 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=1,hidden_size=6144": { + "BLOCK_K": 128, + "SPLIT_K": 64, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..d5caa9380998 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=1,hidden_size=6400": { + "BLOCK_K": 128, + "SPLIT_K": 64, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..03c28033d268 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=1,hidden_size=64000": { + "BLOCK_K": 128, + "SPLIT_K": 128, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..e5055c7016d7 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=1,hidden_size=64256": { + "BLOCK_K": 64, + "SPLIT_K": 128, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..64efb8751002 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=1,hidden_size=6848": { + "BLOCK_K": 128, + "SPLIT_K": 64, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=6912 
device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..090a44e33153 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=1,hidden_size=6912": { + "BLOCK_K": 128, + "SPLIT_K": 64, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..47231de840f8 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=1,hidden_size=7168": { + "BLOCK_K": 128, + "SPLIT_K": 64, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..d0b6943c5df2 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=1,hidden_size=8192": { + "BLOCK_K": 128, + "SPLIT_K": 64, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..bcfe34389c8e --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=1,hidden_size=9216": { + "BLOCK_K": 128, + "SPLIT_K": 128, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..9f2c62aa7bf3 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=104,hidden_size=1024": { + "BLOCK_K": 32, + "SPLIT_K": 4, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..f205785ac354 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=104,hidden_size=10240": { + "BLOCK_K": 64, + "SPLIT_K": 4, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..3b1ae9af607c --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=104,hidden_size=102400": { + "BLOCK_K": 128, + "SPLIT_K": 1, + "num_warps": 4 + } +} diff --git 
a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..21f6c7c4d2c1 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=104,hidden_size=102656": { + "BLOCK_K": 128, + "SPLIT_K": 1, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..80432fa22304 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=104,hidden_size=11008": { + "BLOCK_K": 32, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..431749d79e07 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=104,hidden_size=1152": { + "BLOCK_K": 64, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..e4cf65f2b466 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=104,hidden_size=128": { + "BLOCK_K": 64, + "SPLIT_K": 1, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..456cdd0cbfa7 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=104,hidden_size=1280": { + "BLOCK_K": 32, + "SPLIT_K": 16, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..657d4601e38d --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=104,hidden_size=128000": { + "BLOCK_K": 128, + "SPLIT_K": 1, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..3fdc882f8d4d --- /dev/null +++ 
b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=104,hidden_size=128256": { + "BLOCK_K": 128, + "SPLIT_K": 2, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..f16d8e73b04f --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=104,hidden_size=13824": { + "BLOCK_K": 64, + "SPLIT_K": 2, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..cdb9f921d9e2 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=104,hidden_size=14336": { + "BLOCK_K": 64, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..a4a2728ea3f7 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=104,hidden_size=1536": { + "BLOCK_K": 64, + "SPLIT_K": 4, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..e250ad59c685 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=104,hidden_size=15360": { + "BLOCK_K": 64, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..1fe78e780ab8 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=104,hidden_size=2048": { + "BLOCK_K": 32, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..ea5d35272955 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=104,hidden_size=22016": { + "BLOCK_K": 64, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=2304 
device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..c24d3ebad8f6 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=104,hidden_size=2304": { + "BLOCK_K": 64, + "SPLIT_K": 4, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..60d16b61c097 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=104,hidden_size=24576": { + "BLOCK_K": 64, + "SPLIT_K": 4, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..1ac220f8cdb4 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=104,hidden_size=256": { + "BLOCK_K": 32, + "SPLIT_K": 2, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..593b6236bedc --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=104,hidden_size=2560": { + "BLOCK_K": 64, + "SPLIT_K": 4, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..e02f41487062 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=104,hidden_size=27392": { + "BLOCK_K": 64, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..0e21dad2a1bb --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=104,hidden_size=2752": { + "BLOCK_K": 64, + "SPLIT_K": 2, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..e7de2732db5d --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 
+1,7 @@ +{ + "batchs=104,hidden_size=27648": { + "BLOCK_K": 64, + "SPLIT_K": 2, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..4594326f1214 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=104,hidden_size=3072": { + "BLOCK_K": 32, + "SPLIT_K": 4, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..476b94614e61 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=104,hidden_size=32000": { + "BLOCK_K": 64, + "SPLIT_K": 2, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..3394e0b2b2e2 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=104,hidden_size=32256": { + "BLOCK_K": 64, + "SPLIT_K": 4, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..47b57eaba2d9 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=104,hidden_size=32512": { + "BLOCK_K": 128, + "SPLIT_K": 2, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..2095dc93481f --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=104,hidden_size=32768": { + "BLOCK_K": 256, + "SPLIT_K": 2, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..d7ae5539e292 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=104,hidden_size=33024": { + "BLOCK_K": 128, + "SPLIT_K": 2, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=3328 
device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..392502ed9bef --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=104,hidden_size=3328": { + "BLOCK_K": 128, + "SPLIT_K": 2, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..caaf6dd953b3 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=104,hidden_size=3456": { + "BLOCK_K": 128, + "SPLIT_K": 2, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..34fa2b5fc43e --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=104,hidden_size=3584": { + "BLOCK_K": 64, + "SPLIT_K": 4, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..71e95b2a3456 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=104,hidden_size=36864": { + "BLOCK_K": 128, + "SPLIT_K": 2, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..1506da8d5b06 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=104,hidden_size=4096": { + "BLOCK_K": 32, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..82dd0f4c7d44 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=104,hidden_size=43264": { + "BLOCK_K": 64, + "SPLIT_K": 2, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..f97d81fa1054 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=104,hidden_size=4608": { + "BLOCK_K": 256, + "SPLIT_K": 2, + "num_warps": 4 + } +} diff 
--git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..985c8781d3a0 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=104,hidden_size=49152": { + "BLOCK_K": 128, + "SPLIT_K": 1, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..7ca73c314f25 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=104,hidden_size=512": { + "BLOCK_K": 32, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..1a63f1a720a0 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=104,hidden_size=5120": { + "BLOCK_K": 64, + "SPLIT_K": 2, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..022bfd82a54b --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=104,hidden_size=5504": { + "BLOCK_K": 32, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..9b22d0042659 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=104,hidden_size=5632": { + "BLOCK_K": 64, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..4075a65a0a7c --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=104,hidden_size=6144": { + "BLOCK_K": 32, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..eabfa5752d0a --- /dev/null +++ 
b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=104,hidden_size=6400": { + "BLOCK_K": 32, + "SPLIT_K": 4, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..4c12481b8079 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=104,hidden_size=64000": { + "BLOCK_K": 256, + "SPLIT_K": 2, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..e30688c4abd2 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=104,hidden_size=64256": { + "BLOCK_K": 128, + "SPLIT_K": 1, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..2e29f14a92cd --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=104,hidden_size=6848": { + "BLOCK_K": 64, + "SPLIT_K": 4, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..13ff36677840 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=104,hidden_size=6912": { + "BLOCK_K": 64, + "SPLIT_K": 4, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..8539aa72ab90 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=104,hidden_size=7168": { + "BLOCK_K": 64, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..53b66c493ea1 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=104,hidden_size=8192": { + "BLOCK_K": 128, + "SPLIT_K": 2, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=9216 
device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..a6dfe596884f --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=104,hidden_size=9216": { + "BLOCK_K": 32, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..602ffc66510d --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=16,hidden_size=1024": { + "BLOCK_K": 32, + "SPLIT_K": 16, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..fafea8e3786d --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=16,hidden_size=10240": { + "BLOCK_K": 128, + "SPLIT_K": 64, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..1eec6dc77df8 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=16,hidden_size=102400": { + "BLOCK_K": 128, + "SPLIT_K": 16, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..dedb0618406f --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=16,hidden_size=102656": { + "BLOCK_K": 256, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..de1dcf78c395 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=16,hidden_size=11008": { + "BLOCK_K": 32, + "SPLIT_K": 16, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..73bc95410c42 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 
+1,7 @@ +{ + "batchs=16,hidden_size=1152": { + "BLOCK_K": 32, + "SPLIT_K": 16, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..c7834800f885 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=16,hidden_size=128": { + "BLOCK_K": 32, + "SPLIT_K": 4, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..65868fe3b2f4 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=16,hidden_size=1280": { + "BLOCK_K": 32, + "SPLIT_K": 16, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..6c18618b318f --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=16,hidden_size=128000": { + "BLOCK_K": 256, + "SPLIT_K": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..a036606de7cf --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=16,hidden_size=128256": { + "BLOCK_K": 128, + "SPLIT_K": 16, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..89df4c522fae --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=16,hidden_size=13824": { + "BLOCK_K": 32, + "SPLIT_K": 16, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..a142ada1ad07 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=16,hidden_size=14336": { + "BLOCK_K": 128, + "SPLIT_K": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 
100644 index 000000000000..9bb33d368fef --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=16,hidden_size=1536": { + "BLOCK_K": 32, + "SPLIT_K": 16, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..3a988a776e6f --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=16,hidden_size=15360": { + "BLOCK_K": 128, + "SPLIT_K": 32, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..a57d196ba0ae --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=16,hidden_size=2048": { + "BLOCK_K": 32, + "SPLIT_K": 16, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..c6aa7bf7b99b --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=16,hidden_size=22016": { + "BLOCK_K": 128, + "SPLIT_K": 16, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..83c7c4e4c9a2 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=16,hidden_size=2304": { + "BLOCK_K": 32, + "SPLIT_K": 16, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..427c0106c6ab --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=16,hidden_size=24576": { + "BLOCK_K": 128, + "SPLIT_K": 16, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..d47ff5bbb529 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=16,hidden_size=256": { + "BLOCK_K": 32, + "SPLIT_K": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=2560 
device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..6f1afccc1299 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=16,hidden_size=2560": { + "BLOCK_K": 32, + "SPLIT_K": 16, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..61a1b944f16c --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=16,hidden_size=27392": { + "BLOCK_K": 64, + "SPLIT_K": 16, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..6423b56c688d --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=16,hidden_size=2752": { + "BLOCK_K": 32, + "SPLIT_K": 16, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..aa5170de0248 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=16,hidden_size=27648": { + "BLOCK_K": 64, + "SPLIT_K": 32, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..ae0f48e917f2 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=16,hidden_size=3072": { + "BLOCK_K": 64, + "SPLIT_K": 16, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..6897b5908a5b --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=16,hidden_size=32000": { + "BLOCK_K": 64, + "SPLIT_K": 32, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..4c40c7bfc193 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + 
"batchs=16,hidden_size=32256": { + "BLOCK_K": 64, + "SPLIT_K": 16, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..8b38bafedc79 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=16,hidden_size=32512": { + "BLOCK_K": 64, + "SPLIT_K": 16, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..81d3642fe4ec --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=16,hidden_size=32768": { + "BLOCK_K": 64, + "SPLIT_K": 32, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..d5263b43cc15 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=16,hidden_size=33024": { + "BLOCK_K": 256, + "SPLIT_K": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..35d23d34e4a7 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=16,hidden_size=3328": { + "BLOCK_K": 64, + "SPLIT_K": 16, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..d72e173b284f --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=16,hidden_size=3456": { + "BLOCK_K": 128, + "SPLIT_K": 16, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..a6387b5fc49e --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=16,hidden_size=3584": { + "BLOCK_K": 32, + "SPLIT_K": 32, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 
000000000000..d80742511305 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=16,hidden_size=36864": { + "BLOCK_K": 128, + "SPLIT_K": 16, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..b27c32ff573f --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=16,hidden_size=4096": { + "BLOCK_K": 64, + "SPLIT_K": 16, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..af6c84968dea --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=16,hidden_size=43264": { + "BLOCK_K": 128, + "SPLIT_K": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..6da56b5f85f0 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=16,hidden_size=4608": { + "BLOCK_K": 64, + "SPLIT_K": 32, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..aa2f1263faae --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=16,hidden_size=49152": { + "BLOCK_K": 128, + "SPLIT_K": 256, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..f3b5f24c791b --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=16,hidden_size=512": { + "BLOCK_K": 32, + "SPLIT_K": 16, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..dde06af0de34 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=16,hidden_size=5120": { + "BLOCK_K": 128, + "SPLIT_K": 64, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=5504 
device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..2dc362e1e1d4 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=16,hidden_size=5504": { + "BLOCK_K": 64, + "SPLIT_K": 64, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..76853f696c19 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=16,hidden_size=5632": { + "BLOCK_K": 32, + "SPLIT_K": 16, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..d498238ea795 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=16,hidden_size=6144": { + "BLOCK_K": 256, + "SPLIT_K": 32, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..fc10d7616de9 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=16,hidden_size=6400": { + "BLOCK_K": 64, + "SPLIT_K": 128, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..60aed7116c3d --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=16,hidden_size=64000": { + "BLOCK_K": 128, + "SPLIT_K": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..98bccfc678a1 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=16,hidden_size=64256": { + "BLOCK_K": 128, + "SPLIT_K": 32, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..71bf22fc018b --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + 
"batchs=16,hidden_size=6848": { + "BLOCK_K": 64, + "SPLIT_K": 64, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..0cbc42cfc86f --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=16,hidden_size=6912": { + "BLOCK_K": 32, + "SPLIT_K": 64, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..ebab6f621840 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=16,hidden_size=7168": { + "BLOCK_K": 64, + "SPLIT_K": 16, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..7f2582d5fabb --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=16,hidden_size=8192": { + "BLOCK_K": 32, + "SPLIT_K": 32, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..37a66a2651be --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=16,hidden_size=9216": { + "BLOCK_K": 64, + "SPLIT_K": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..cb914f7d052d --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=2,hidden_size=1024": { + "BLOCK_K": 32, + "SPLIT_K": 32, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..3cbfe81a8241 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=2,hidden_size=10240": { + "BLOCK_K": 64, + "SPLIT_K": 64, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 
000000000000..01318faf6258 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=2,hidden_size=102400": { + "BLOCK_K": 64, + "SPLIT_K": 128, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..7201e38109c1 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=2,hidden_size=102656": { + "BLOCK_K": 128, + "SPLIT_K": 64, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..d36bf9a65f9e --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=2,hidden_size=11008": { + "BLOCK_K": 64, + "SPLIT_K": 64, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..01a390c790a7 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=2,hidden_size=1152": { + "BLOCK_K": 32, + "SPLIT_K": 64, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..068bf8ed0f4e --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=2,hidden_size=128": { + "BLOCK_K": 32, + "SPLIT_K": 16, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..7172561ad91a --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=2,hidden_size=1280": { + "BLOCK_K": 32, + "SPLIT_K": 32, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..c39444441044 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=2,hidden_size=128000": { + "BLOCK_K": 128, + "SPLIT_K": 128, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=128256 
device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..0fe8be7dd9e4 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=2,hidden_size=128256": { + "BLOCK_K": 64, + "SPLIT_K": 128, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..463a4228b8b6 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=2,hidden_size=13824": { + "BLOCK_K": 64, + "SPLIT_K": 128, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..1c566ee67de8 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=2,hidden_size=14336": { + "BLOCK_K": 32, + "SPLIT_K": 128, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..bd612e59861b --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=2,hidden_size=1536": { + "BLOCK_K": 32, + "SPLIT_K": 64, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..c6a18389e22b --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=2,hidden_size=15360": { + "BLOCK_K": 64, + "SPLIT_K": 64, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..de611d52747f --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=2,hidden_size=2048": { + "BLOCK_K": 64, + "SPLIT_K": 32, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..7692a7d496a3 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + 
"batchs=2,hidden_size=22016": { + "BLOCK_K": 64, + "SPLIT_K": 128, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..67533819f2b7 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=2,hidden_size=2304": { + "BLOCK_K": 64, + "SPLIT_K": 64, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..e7df88ce3d48 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=2,hidden_size=24576": { + "BLOCK_K": 128, + "SPLIT_K": 128, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..d60a540f6c9f --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=2,hidden_size=256": { + "BLOCK_K": 32, + "SPLIT_K": 32, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..d4f9c110fc67 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=2,hidden_size=2560": { + "BLOCK_K": 64, + "SPLIT_K": 64, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..db4554d192bc --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=2,hidden_size=27392": { + "BLOCK_K": 32, + "SPLIT_K": 128, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..e468805dd6bf --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=2,hidden_size=2752": { + "BLOCK_K": 64, + "SPLIT_K": 64, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..ffbb85f3cc9d --- 
/dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=2,hidden_size=27648": { + "BLOCK_K": 64, + "SPLIT_K": 64, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..77046010f9f9 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=2,hidden_size=3072": { + "BLOCK_K": 64, + "SPLIT_K": 64, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..101a1fde2e17 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=2,hidden_size=32000": { + "BLOCK_K": 128, + "SPLIT_K": 64, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..97ecc56f5a4b --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=2,hidden_size=32256": { + "BLOCK_K": 128, + "SPLIT_K": 128, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..f1f3fa79859c --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=2,hidden_size=32512": { + "BLOCK_K": 128, + "SPLIT_K": 128, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..f26ad0121efa --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=2,hidden_size=32768": { + "BLOCK_K": 64, + "SPLIT_K": 64, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..08f2719e2a70 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=2,hidden_size=33024": { + "BLOCK_K": 64, + "SPLIT_K": 64, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json 
b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..19650dbe5c76 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=2,hidden_size=3328": { + "BLOCK_K": 64, + "SPLIT_K": 64, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..43b03a77fccf --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=2,hidden_size=3456": { + "BLOCK_K": 64, + "SPLIT_K": 64, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..5ea0fd9b7fb1 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=2,hidden_size=3584": { + "BLOCK_K": 64, + "SPLIT_K": 64, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..97e4bf17acb5 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=2,hidden_size=36864": { + "BLOCK_K": 64, + "SPLIT_K": 64, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..b201e52f3f1a --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=2,hidden_size=4096": { + "BLOCK_K": 128, + "SPLIT_K": 32, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..d79ac16a16e0 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=2,hidden_size=43264": { + "BLOCK_K": 128, + "SPLIT_K": 128, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..9d1bc041bfa0 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=2,hidden_size=4608": { + "BLOCK_K": 32, + "SPLIT_K": 64, + 
"num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..af3fbc6aa009 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=2,hidden_size=49152": { + "BLOCK_K": 32, + "SPLIT_K": 256, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..3e6ab8e331e8 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=2,hidden_size=512": { + "BLOCK_K": 32, + "SPLIT_K": 16, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..3824f7dc6657 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=2,hidden_size=5120": { + "BLOCK_K": 32, + "SPLIT_K": 64, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..d1fd2e96658a --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=2,hidden_size=5504": { + "BLOCK_K": 32, + "SPLIT_K": 64, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..d9642bf1e5bf --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=2,hidden_size=5632": { + "BLOCK_K": 32, + "SPLIT_K": 64, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..89e5b64ce96b --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=2,hidden_size=6144": { + "BLOCK_K": 32, + "SPLIT_K": 64, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..2345b7e5e882 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=6400 
device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=2,hidden_size=6400": { + "BLOCK_K": 64, + "SPLIT_K": 64, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..6de6f1c12486 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=2,hidden_size=64000": { + "BLOCK_K": 64, + "SPLIT_K": 256, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..b6c138a55f28 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=2,hidden_size=64256": { + "BLOCK_K": 128, + "SPLIT_K": 64, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..4a63ed737b7a --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=2,hidden_size=6848": { + "BLOCK_K": 64, + "SPLIT_K": 64, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..ee7b427fbae6 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=2,hidden_size=6912": { + "BLOCK_K": 64, + "SPLIT_K": 64, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..345cb9192105 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=2,hidden_size=7168": { + "BLOCK_K": 32, + "SPLIT_K": 64, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..91dabeb15527 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=2,hidden_size=8192": { + "BLOCK_K": 64, + "SPLIT_K": 64, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json new 
file mode 100644 index 000000000000..b90e9970c8c0 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=2,hidden_size=9216": { + "BLOCK_K": 32, + "SPLIT_K": 64, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..d750dbc94ebf --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=24,hidden_size=1024": { + "BLOCK_K": 32, + "SPLIT_K": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..422be89522eb --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=24,hidden_size=10240": { + "BLOCK_K": 128, + "SPLIT_K": 16, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..a43fa1f140e9 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=24,hidden_size=102400": { + "BLOCK_K": 128, + "SPLIT_K": 16, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..89e3dac0a31e --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=24,hidden_size=102656": { + "BLOCK_K": 128, + "SPLIT_K": 16, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..8c75da6742a4 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=24,hidden_size=11008": { + "BLOCK_K": 32, + "SPLIT_K": 32, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..81d9eeae3f90 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=24,hidden_size=1152": { + "BLOCK_K": 32, + "SPLIT_K": 8, + "num_warps": 8 + } +} diff --git 
a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..32c16964bada --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=24,hidden_size=128": { + "BLOCK_K": 32, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..cd66a4abdfdd --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=24,hidden_size=1280": { + "BLOCK_K": 32, + "SPLIT_K": 16, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..6e95d75f0b7a --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=24,hidden_size=128000": { + "BLOCK_K": 128, + "SPLIT_K": 16, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..6ffa41368045 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=24,hidden_size=128256": { + "BLOCK_K": 64, + "SPLIT_K": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..0c993ab8ef41 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=24,hidden_size=13824": { + "BLOCK_K": 128, + "SPLIT_K": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..4bef43ff2a51 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=24,hidden_size=14336": { + "BLOCK_K": 64, + "SPLIT_K": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..382f702b15f5 --- /dev/null +++ 
b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=24,hidden_size=1536": { + "BLOCK_K": 32, + "SPLIT_K": 16, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..1b6fb8d0262c --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=24,hidden_size=15360": { + "BLOCK_K": 64, + "SPLIT_K": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..9f9ab7d1da6f --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=24,hidden_size=2048": { + "BLOCK_K": 32, + "SPLIT_K": 16, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..2ca6015cc8e2 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=24,hidden_size=22016": { + "BLOCK_K": 64, + "SPLIT_K": 16, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..c01c77b849ad --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=24,hidden_size=2304": { + "BLOCK_K": 32, + "SPLIT_K": 16, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..1ffc10aab8a9 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=24,hidden_size=24576": { + "BLOCK_K": 128, + "SPLIT_K": 16, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..9c90c8e5d3d6 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=24,hidden_size=256": { + "BLOCK_K": 32, + "SPLIT_K": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json 
b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..a4a5aff5f9bf --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=24,hidden_size=2560": { + "BLOCK_K": 64, + "SPLIT_K": 16, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..bfad4944a155 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=24,hidden_size=27392": { + "BLOCK_K": 128, + "SPLIT_K": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..f377b9487847 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=24,hidden_size=2752": { + "BLOCK_K": 128, + "SPLIT_K": 32, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..294a45493519 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=24,hidden_size=27648": { + "BLOCK_K": 64, + "SPLIT_K": 32, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..5dd926e91002 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=24,hidden_size=3072": { + "BLOCK_K": 128, + "SPLIT_K": 16, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..7b41f769bfea --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=24,hidden_size=32000": { + "BLOCK_K": 128, + "SPLIT_K": 16, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..2f376b9d3d99 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=24,hidden_size=32256": { + 
"BLOCK_K": 128, + "SPLIT_K": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..5a03b6ba2242 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=24,hidden_size=32512": { + "BLOCK_K": 64, + "SPLIT_K": 16, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..a093a839dd4e --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=24,hidden_size=32768": { + "BLOCK_K": 64, + "SPLIT_K": 16, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..904257e85be9 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=24,hidden_size=33024": { + "BLOCK_K": 64, + "SPLIT_K": 32, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..4f41954d2c44 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=24,hidden_size=3328": { + "BLOCK_K": 32, + "SPLIT_K": 64, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..7b430bb3ef84 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=24,hidden_size=3456": { + "BLOCK_K": 128, + "SPLIT_K": 16, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..5545e8a9bb58 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=24,hidden_size=3584": { + "BLOCK_K": 32, + "SPLIT_K": 32, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..76e30d5c6721 --- /dev/null 
+++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=24,hidden_size=36864": { + "BLOCK_K": 256, + "SPLIT_K": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..60adb9b594de --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=24,hidden_size=4096": { + "BLOCK_K": 64, + "SPLIT_K": 64, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..70fac02be5f4 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=24,hidden_size=43264": { + "BLOCK_K": 64, + "SPLIT_K": 32, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..fac51807dd14 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=24,hidden_size=4608": { + "BLOCK_K": 64, + "SPLIT_K": 128, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..f8af00a3f0b2 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=24,hidden_size=49152": { + "BLOCK_K": 256, + "SPLIT_K": 64, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..ec0716115b72 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=24,hidden_size=512": { + "BLOCK_K": 32, + "SPLIT_K": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..5bfce900dd59 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=24,hidden_size=5120": { + "BLOCK_K": 128, + "SPLIT_K": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json 
b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..142a66db84ff --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=24,hidden_size=5504": { + "BLOCK_K": 128, + "SPLIT_K": 64, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..1e819ccf88b9 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=24,hidden_size=5632": { + "BLOCK_K": 128, + "SPLIT_K": 16, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..640dfe5c4aab --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=24,hidden_size=6144": { + "BLOCK_K": 64, + "SPLIT_K": 64, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..3f9f49c856ae --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=24,hidden_size=6400": { + "BLOCK_K": 64, + "SPLIT_K": 16, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..ee67fafb3f47 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=24,hidden_size=64000": { + "BLOCK_K": 128, + "SPLIT_K": 16, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..a4c7d1951a2d --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=24,hidden_size=64256": { + "BLOCK_K": 256, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..2668afddbf9d --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=24,hidden_size=6848": { + "BLOCK_K": 
64, + "SPLIT_K": 16, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..432191c74e6e --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=24,hidden_size=6912": { + "BLOCK_K": 64, + "SPLIT_K": 16, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..965794c29584 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=24,hidden_size=7168": { + "BLOCK_K": 128, + "SPLIT_K": 16, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..4099c8b31019 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=24,hidden_size=8192": { + "BLOCK_K": 128, + "SPLIT_K": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..54f120427642 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=24,hidden_size=9216": { + "BLOCK_K": 256, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..bc0d77a48b76 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=32,hidden_size=1024": { + "BLOCK_K": 32, + "SPLIT_K": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..4d245dd8606c --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=32,hidden_size=10240": { + "BLOCK_K": 128, + "SPLIT_K": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..15f2726ffb7f --- /dev/null +++ 
b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=32,hidden_size=102400": { + "BLOCK_K": 64, + "SPLIT_K": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..610e795d0889 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=32,hidden_size=102656": { + "BLOCK_K": 128, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..86f0bd139844 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=32,hidden_size=11008": { + "BLOCK_K": 256, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..525db2cd9c83 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=32,hidden_size=1152": { + "BLOCK_K": 32, + "SPLIT_K": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..f154c1ac63ae --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=32,hidden_size=128": { + "BLOCK_K": 32, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..4d9ca99f621b --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=32,hidden_size=1280": { + "BLOCK_K": 32, + "SPLIT_K": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..4ce6fea19d94 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=32,hidden_size=128000": { + "BLOCK_K": 128, + "SPLIT_K": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json 
b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..61b5a2e19b2a --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=32,hidden_size=128256": { + "BLOCK_K": 64, + "SPLIT_K": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..ee27b7eb7145 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=32,hidden_size=13824": { + "BLOCK_K": 32, + "SPLIT_K": 16, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..f1c0786c18a5 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=32,hidden_size=14336": { + "BLOCK_K": 64, + "SPLIT_K": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..2e80cd18b7e3 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=32,hidden_size=1536": { + "BLOCK_K": 64, + "SPLIT_K": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..a7ebd2698a89 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=32,hidden_size=15360": { + "BLOCK_K": 32, + "SPLIT_K": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..a70cbb63546d --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=32,hidden_size=2048": { + "BLOCK_K": 64, + "SPLIT_K": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..411a6d693f37 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=32,hidden_size=22016": { + 
"BLOCK_K": 64, + "SPLIT_K": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..7bf4485bdafb --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=32,hidden_size=2304": { + "BLOCK_K": 32, + "SPLIT_K": 16, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..4b83542ed831 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=32,hidden_size=24576": { + "BLOCK_K": 64, + "SPLIT_K": 16, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..fcdf01c82b6e --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=32,hidden_size=256": { + "BLOCK_K": 32, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..27fa6e4ab908 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=32,hidden_size=2560": { + "BLOCK_K": 32, + "SPLIT_K": 16, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..f32c6f35e55b --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=32,hidden_size=27392": { + "BLOCK_K": 64, + "SPLIT_K": 16, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..97e139922a3b --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=32,hidden_size=2752": { + "BLOCK_K": 32, + "SPLIT_K": 64, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..633e173ede3d --- /dev/null +++ 
b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=32,hidden_size=27648": { + "BLOCK_K": 128, + "SPLIT_K": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..4b2853e27e6d --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=32,hidden_size=3072": { + "BLOCK_K": 32, + "SPLIT_K": 16, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..dbca49ebf47f --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=32,hidden_size=32000": { + "BLOCK_K": 128, + "SPLIT_K": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..bbbb809b4bac --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=32,hidden_size=32256": { + "BLOCK_K": 256, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..2db20b0f523a --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=32,hidden_size=32512": { + "BLOCK_K": 128, + "SPLIT_K": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..b76c39dbc79c --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=32,hidden_size=32768": { + "BLOCK_K": 256, + "SPLIT_K": 4, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..95de274c392b --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=32,hidden_size=33024": { + "BLOCK_K": 64, + "SPLIT_K": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json 
b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..31cc0f0988df --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=32,hidden_size=3328": { + "BLOCK_K": 128, + "SPLIT_K": 32, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..d5848f9c0dc2 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=32,hidden_size=3456": { + "BLOCK_K": 128, + "SPLIT_K": 32, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..928c4793a1b4 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=32,hidden_size=3584": { + "BLOCK_K": 128, + "SPLIT_K": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..87333a2977e6 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=32,hidden_size=36864": { + "BLOCK_K": 128, + "SPLIT_K": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..a13f1ae708cc --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=32,hidden_size=4096": { + "BLOCK_K": 256, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..e263c22b48c8 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=32,hidden_size=43264": { + "BLOCK_K": 128, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..28ccb7922928 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=32,hidden_size=4608": { + "BLOCK_K": 
64, + "SPLIT_K": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..74bdc063f829 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=32,hidden_size=49152": { + "BLOCK_K": 32, + "SPLIT_K": 256, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..e65f07e0c1b7 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=32,hidden_size=512": { + "BLOCK_K": 32, + "SPLIT_K": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..86b22822e193 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=32,hidden_size=5120": { + "BLOCK_K": 128, + "SPLIT_K": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..068d658420a2 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=32,hidden_size=5504": { + "BLOCK_K": 128, + "SPLIT_K": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..bb22b22c0cf4 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=32,hidden_size=5632": { + "BLOCK_K": 128, + "SPLIT_K": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..693e8b466480 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=32,hidden_size=6144": { + "BLOCK_K": 32, + "SPLIT_K": 16, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..72e213c9c841 --- /dev/null +++ 
b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=32,hidden_size=6400": { + "BLOCK_K": 128, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..8983a00a5dc0 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=32,hidden_size=64000": { + "BLOCK_K": 64, + "SPLIT_K": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..7ebc1a433047 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=32,hidden_size=64256": { + "BLOCK_K": 64, + "SPLIT_K": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..baf8710e7904 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=32,hidden_size=6848": { + "BLOCK_K": 256, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..83d82eab352e --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=32,hidden_size=6912": { + "BLOCK_K": 32, + "SPLIT_K": 16, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..ea365d60b5f7 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=32,hidden_size=7168": { + "BLOCK_K": 128, + "SPLIT_K": 4, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..e0e266cba149 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=32,hidden_size=8192": { + "BLOCK_K": 128, + "SPLIT_K": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json 
b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..df672f000e51 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=32,hidden_size=9216": { + "BLOCK_K": 64, + "SPLIT_K": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..21b7f37b6f10 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=4,hidden_size=1024": { + "BLOCK_K": 64, + "SPLIT_K": 16, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..6780bd1b6061 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=4,hidden_size=10240": { + "BLOCK_K": 64, + "SPLIT_K": 64, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..c8d5d48f3067 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=4,hidden_size=102400": { + "BLOCK_K": 128, + "SPLIT_K": 64, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..8092de1d055e --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=4,hidden_size=102656": { + "BLOCK_K": 64, + "SPLIT_K": 32, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..25352c1f541a --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=4,hidden_size=11008": { + "BLOCK_K": 64, + "SPLIT_K": 64, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..72a9af0df017 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=4,hidden_size=1152": { + "BLOCK_K": 64, + 
"SPLIT_K": 32, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..8b8a4d4ba0a6 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=4,hidden_size=128": { + "BLOCK_K": 32, + "SPLIT_K": 16, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..5a63ccf5ea27 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=4,hidden_size=1280": { + "BLOCK_K": 64, + "SPLIT_K": 32, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..f5322f696374 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=4,hidden_size=128000": { + "BLOCK_K": 64, + "SPLIT_K": 32, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..9c0b7751d2a2 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=4,hidden_size=128256": { + "BLOCK_K": 128, + "SPLIT_K": 64, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..3645b8d9bc6e --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=4,hidden_size=13824": { + "BLOCK_K": 64, + "SPLIT_K": 16, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..f8e1776dbeac --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=4,hidden_size=14336": { + "BLOCK_K": 64, + "SPLIT_K": 32, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..67a3fb9e41c7 --- /dev/null +++ 
b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=4,hidden_size=1536": { + "BLOCK_K": 64, + "SPLIT_K": 32, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..d145bf3a8058 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=4,hidden_size=15360": { + "BLOCK_K": 256, + "SPLIT_K": 32, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..17bbd1c727ce --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=4,hidden_size=2048": { + "BLOCK_K": 128, + "SPLIT_K": 16, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..50bb33d9ddc2 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=4,hidden_size=22016": { + "BLOCK_K": 32, + "SPLIT_K": 64, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..28de03a38564 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=4,hidden_size=2304": { + "BLOCK_K": 128, + "SPLIT_K": 32, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..e8696121d871 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=4,hidden_size=24576": { + "BLOCK_K": 64, + "SPLIT_K": 256, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..f24f53814e53 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=4,hidden_size=256": { + "BLOCK_K": 32, + "SPLIT_K": 16, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json 
b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..414e8136350b --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=4,hidden_size=2560": { + "BLOCK_K": 128, + "SPLIT_K": 32, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..d0a7f1ad50ca --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=4,hidden_size=27392": { + "BLOCK_K": 128, + "SPLIT_K": 64, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..90b14d89c238 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=4,hidden_size=2752": { + "BLOCK_K": 32, + "SPLIT_K": 32, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..dfb4a2036591 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=4,hidden_size=27648": { + "BLOCK_K": 32, + "SPLIT_K": 256, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..afcc971891fd --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=4,hidden_size=3072": { + "BLOCK_K": 128, + "SPLIT_K": 32, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..86627bb4c2b6 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=4,hidden_size=32000": { + "BLOCK_K": 128, + "SPLIT_K": 256, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..18f2e8d01075 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=4,hidden_size=32256": { + "BLOCK_K": 64, + "SPLIT_K": 
64, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..60216da8b12c --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=4,hidden_size=32512": { + "BLOCK_K": 128, + "SPLIT_K": 64, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..3f3d7c814ec2 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=4,hidden_size=32768": { + "BLOCK_K": 32, + "SPLIT_K": 256, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..11f41c734aed --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=4,hidden_size=33024": { + "BLOCK_K": 128, + "SPLIT_K": 16, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..b7b78e643da8 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=4,hidden_size=3328": { + "BLOCK_K": 64, + "SPLIT_K": 64, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..dd301c41083a --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=4,hidden_size=3456": { + "BLOCK_K": 64, + "SPLIT_K": 64, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..00e16062556b --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=4,hidden_size=3584": { + "BLOCK_K": 64, + "SPLIT_K": 32, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..90b71d6a48aa --- /dev/null +++ 
b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=4,hidden_size=36864": { + "BLOCK_K": 128, + "SPLIT_K": 128, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..4567fabf9fa3 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=4,hidden_size=4096": { + "BLOCK_K": 64, + "SPLIT_K": 32, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..d758bd2bd5d4 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=4,hidden_size=43264": { + "BLOCK_K": 64, + "SPLIT_K": 128, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..4dd89a2baa91 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=4,hidden_size=4608": { + "BLOCK_K": 128, + "SPLIT_K": 64, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..4e2ea6dc099c --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=4,hidden_size=49152": { + "BLOCK_K": 128, + "SPLIT_K": 32, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..571e5f11fb3a --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=4,hidden_size=512": { + "BLOCK_K": 32, + "SPLIT_K": 16, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..72fbed4df169 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=4,hidden_size=5120": { + "BLOCK_K": 128, + "SPLIT_K": 64, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json 
b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..cdebe9fa2d4d --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=4,hidden_size=5504": { + "BLOCK_K": 64, + "SPLIT_K": 32, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..b3315ac27857 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=4,hidden_size=5632": { + "BLOCK_K": 64, + "SPLIT_K": 32, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..0f58f063add0 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=4,hidden_size=6144": { + "BLOCK_K": 64, + "SPLIT_K": 32, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..36435a35bd1f --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=4,hidden_size=6400": { + "BLOCK_K": 64, + "SPLIT_K": 64, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..a3d53ddfb20c --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=4,hidden_size=64000": { + "BLOCK_K": 128, + "SPLIT_K": 64, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..9ec3fad337d0 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=4,hidden_size=64256": { + "BLOCK_K": 128, + "SPLIT_K": 64, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..48399c1da49f --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=4,hidden_size=6848": { + "BLOCK_K": 64, + "SPLIT_K": 64, + 
"num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..90a1a1dc7123 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=4,hidden_size=6912": { + "BLOCK_K": 64, + "SPLIT_K": 64, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..490f4fe0958a --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=4,hidden_size=7168": { + "BLOCK_K": 64, + "SPLIT_K": 64, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..7dfbd884de1e --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=4,hidden_size=8192": { + "BLOCK_K": 64, + "SPLIT_K": 64, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..423a0a8bb660 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=4,hidden_size=9216": { + "BLOCK_K": 32, + "SPLIT_K": 64, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..a23caa15d938 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=40,hidden_size=1024": { + "BLOCK_K": 64, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..da975d0f7f04 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=40,hidden_size=10240": { + "BLOCK_K": 64, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..765e06971801 --- /dev/null +++ 
b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=40,hidden_size=102400": { + "BLOCK_K": 256, + "SPLIT_K": 2, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..ad02b596b368 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=40,hidden_size=102656": { + "BLOCK_K": 64, + "SPLIT_K": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..6ea35e6b4067 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=40,hidden_size=11008": { + "BLOCK_K": 256, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..3c5b43290284 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=40,hidden_size=1152": { + "BLOCK_K": 32, + "SPLIT_K": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..d1d0c2952eec --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=40,hidden_size=128": { + "BLOCK_K": 128, + "SPLIT_K": 1, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..7ac3b09650e6 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=40,hidden_size=1280": { + "BLOCK_K": 32, + "SPLIT_K": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..7e364a12309c --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=40,hidden_size=128000": { + "BLOCK_K": 128, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json 
b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..362a4794b89d --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=40,hidden_size=128256": { + "BLOCK_K": 128, + "SPLIT_K": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..6daf2db30092 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=40,hidden_size=13824": { + "BLOCK_K": 64, + "SPLIT_K": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..d39676778929 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=40,hidden_size=14336": { + "BLOCK_K": 64, + "SPLIT_K": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..ca1458693a59 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=40,hidden_size=1536": { + "BLOCK_K": 64, + "SPLIT_K": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..5caff317b920 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=40,hidden_size=15360": { + "BLOCK_K": 128, + "SPLIT_K": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..bd333dbdea1b --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=40,hidden_size=2048": { + "BLOCK_K": 32, + "SPLIT_K": 16, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..5b17698cf941 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=40,hidden_size=22016": { + 
"BLOCK_K": 128, + "SPLIT_K": 4, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..8c34a9ffe746 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=40,hidden_size=2304": { + "BLOCK_K": 64, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..628d5e31f820 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=40,hidden_size=24576": { + "BLOCK_K": 128, + "SPLIT_K": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..53510a4a3176 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=40,hidden_size=256": { + "BLOCK_K": 32, + "SPLIT_K": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..36e4a63d0806 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=40,hidden_size=2560": { + "BLOCK_K": 128, + "SPLIT_K": 16, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..e95f7c421e3c --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=40,hidden_size=27392": { + "BLOCK_K": 128, + "SPLIT_K": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..e9fde38aa4c1 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=40,hidden_size=2752": { + "BLOCK_K": 64, + "SPLIT_K": 16, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..675598e6916c --- /dev/null +++ 
b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=40,hidden_size=27648": { + "BLOCK_K": 32, + "SPLIT_K": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..d0c7c62a69f4 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=40,hidden_size=3072": { + "BLOCK_K": 64, + "SPLIT_K": 32, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..d38a57c5cb63 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=40,hidden_size=32000": { + "BLOCK_K": 256, + "SPLIT_K": 2, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..fbf651ee7ab1 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=40,hidden_size=32256": { + "BLOCK_K": 64, + "SPLIT_K": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..f56745eaf5d9 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=40,hidden_size=32512": { + "BLOCK_K": 128, + "SPLIT_K": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..c7e6acf8da4c --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=40,hidden_size=32768": { + "BLOCK_K": 128, + "SPLIT_K": 4, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..e214f20d25dd --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=40,hidden_size=33024": { + "BLOCK_K": 128, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json 
b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..c3a7b1b3d075 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=40,hidden_size=3328": { + "BLOCK_K": 64, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..70c8b1a5cebe --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=40,hidden_size=3456": { + "BLOCK_K": 128, + "SPLIT_K": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..7518d2514d3f --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=40,hidden_size=3584": { + "BLOCK_K": 256, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..8b8caf903283 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=40,hidden_size=36864": { + "BLOCK_K": 128, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..24dfdeb31e27 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=40,hidden_size=4096": { + "BLOCK_K": 128, + "SPLIT_K": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..73974a4b5ec4 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=40,hidden_size=43264": { + "BLOCK_K": 256, + "SPLIT_K": 2, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..134f097ee092 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=40,hidden_size=4608": { + "BLOCK_K": 256, 
+ "SPLIT_K": 4, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..99fd65e94e97 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=40,hidden_size=49152": { + "BLOCK_K": 128, + "SPLIT_K": 256, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..a085010d32df --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=40,hidden_size=512": { + "BLOCK_K": 32, + "SPLIT_K": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..50fa49ac53a0 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=40,hidden_size=5120": { + "BLOCK_K": 128, + "SPLIT_K": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..3444417e1cc2 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=40,hidden_size=5504": { + "BLOCK_K": 32, + "SPLIT_K": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..15f14f8a9a1d --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=40,hidden_size=5632": { + "BLOCK_K": 64, + "SPLIT_K": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..1c05dce95c3a --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=40,hidden_size=6144": { + "BLOCK_K": 64, + "SPLIT_K": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..56833977b005 --- /dev/null +++ 
b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=40,hidden_size=6400": { + "BLOCK_K": 128, + "SPLIT_K": 4, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..bfa3f47fa2eb --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=40,hidden_size=64000": { + "BLOCK_K": 64, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..4991337f39f4 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=40,hidden_size=64256": { + "BLOCK_K": 128, + "SPLIT_K": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..ad9494f2d0ef --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=40,hidden_size=6848": { + "BLOCK_K": 64, + "SPLIT_K": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..72ffcbf1889d --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=40,hidden_size=6912": { + "BLOCK_K": 32, + "SPLIT_K": 16, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..c5ae6958ebdf --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=40,hidden_size=7168": { + "BLOCK_K": 256, + "SPLIT_K": 4, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..bdd5edbfc87b --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=40,hidden_size=8192": { + "BLOCK_K": 64, + "SPLIT_K": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json 
b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..610d6175f27e --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=40,hidden_size=9216": { + "BLOCK_K": 256, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..2548ac5d500e --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=48,hidden_size=1024": { + "BLOCK_K": 32, + "SPLIT_K": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..987dc9b61dc2 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=48,hidden_size=10240": { + "BLOCK_K": 64, + "SPLIT_K": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..848a752c0379 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=48,hidden_size=102400": { + "BLOCK_K": 64, + "SPLIT_K": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..faa00296ce21 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=48,hidden_size=102656": { + "BLOCK_K": 128, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..f280fe287d2b --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=48,hidden_size=11008": { + "BLOCK_K": 128, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..19ddc1d36abf --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=48,hidden_size=1152": { + 
"BLOCK_K": 32, + "SPLIT_K": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..a7f53ede4c6d --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=48,hidden_size=128": { + "BLOCK_K": 128, + "SPLIT_K": 1, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..d3c33217c882 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=48,hidden_size=1280": { + "BLOCK_K": 32, + "SPLIT_K": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..a8fb320c7b01 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=48,hidden_size=128000": { + "BLOCK_K": 64, + "SPLIT_K": 4, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..e0a7c154a14f --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=48,hidden_size=128256": { + "BLOCK_K": 64, + "SPLIT_K": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..c9b76cf2ea95 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=48,hidden_size=13824": { + "BLOCK_K": 128, + "SPLIT_K": 4, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..1fcf12b7eb96 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=48,hidden_size=14336": { + "BLOCK_K": 64, + "SPLIT_K": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..c335578c3dab --- /dev/null 
+++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=48,hidden_size=1536": { + "BLOCK_K": 32, + "SPLIT_K": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..b9f810c13912 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=48,hidden_size=15360": { + "BLOCK_K": 256, + "SPLIT_K": 2, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..215df482ca64 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=48,hidden_size=2048": { + "BLOCK_K": 64, + "SPLIT_K": 32, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..86e68481e14c --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=48,hidden_size=22016": { + "BLOCK_K": 128, + "SPLIT_K": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..ee1aa6e358cf --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=48,hidden_size=2304": { + "BLOCK_K": 128, + "SPLIT_K": 16, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..f00777cd606a --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=48,hidden_size=24576": { + "BLOCK_K": 64, + "SPLIT_K": 16, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..0382b4bfcad6 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=48,hidden_size=256": { + "BLOCK_K": 32, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json 
b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..0cd5ac59ca34 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=48,hidden_size=2560": { + "BLOCK_K": 64, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..3e5825452ae7 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=48,hidden_size=27392": { + "BLOCK_K": 32, + "SPLIT_K": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..e9ebb39aaa52 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=48,hidden_size=2752": { + "BLOCK_K": 128, + "SPLIT_K": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..0951d6150b20 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=48,hidden_size=27648": { + "BLOCK_K": 64, + "SPLIT_K": 16, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..4f6ec8b3fcfc --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=48,hidden_size=3072": { + "BLOCK_K": 64, + "SPLIT_K": 64, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..0dc115f3588e --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=48,hidden_size=32000": { + "BLOCK_K": 64, + "SPLIT_K": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..55b7954dd82a --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=48,hidden_size=32256": { + "BLOCK_K": 
128, + "SPLIT_K": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..786022725aab --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=48,hidden_size=32512": { + "BLOCK_K": 64, + "SPLIT_K": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..a80125409b55 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=48,hidden_size=32768": { + "BLOCK_K": 64, + "SPLIT_K": 16, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..c2c745a5b6ab --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=48,hidden_size=33024": { + "BLOCK_K": 128, + "SPLIT_K": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..e8803b5163ed --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=48,hidden_size=3328": { + "BLOCK_K": 32, + "SPLIT_K": 32, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..d3ee96ee2125 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=48,hidden_size=3456": { + "BLOCK_K": 64, + "SPLIT_K": 16, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..0ba06679888a --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=48,hidden_size=3584": { + "BLOCK_K": 64, + "SPLIT_K": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..db6774f5f21c --- /dev/null +++ 
b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=48,hidden_size=36864": { + "BLOCK_K": 128, + "SPLIT_K": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..a7ffc7684eaf --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=48,hidden_size=4096": { + "BLOCK_K": 128, + "SPLIT_K": 4, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..6f775a7eaa65 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=48,hidden_size=43264": { + "BLOCK_K": 64, + "SPLIT_K": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..29275d15516d --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=48,hidden_size=4608": { + "BLOCK_K": 64, + "SPLIT_K": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..c1f3159e79ad --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=48,hidden_size=49152": { + "BLOCK_K": 64, + "SPLIT_K": 16, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..43d2c867eb78 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=48,hidden_size=512": { + "BLOCK_K": 32, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..3ef158578c25 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=48,hidden_size=5120": { + "BLOCK_K": 128, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json 
b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..2fa3f9b2dd5a --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=48,hidden_size=5504": { + "BLOCK_K": 32, + "SPLIT_K": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..3b7df5022eed --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=48,hidden_size=5632": { + "BLOCK_K": 64, + "SPLIT_K": 16, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..5150a9505d59 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=48,hidden_size=6144": { + "BLOCK_K": 128, + "SPLIT_K": 4, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..15bd83f652b2 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=48,hidden_size=6400": { + "BLOCK_K": 32, + "SPLIT_K": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..be3441337184 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=48,hidden_size=64000": { + "BLOCK_K": 64, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..5adbe1354608 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=48,hidden_size=64256": { + "BLOCK_K": 128, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..4d0c8340f94e --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=48,hidden_size=6848": { + "BLOCK_K": 128, + 
"SPLIT_K": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..f25843901a29 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=48,hidden_size=6912": { + "BLOCK_K": 128, + "SPLIT_K": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..e7f8fdd5c289 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=48,hidden_size=7168": { + "BLOCK_K": 32, + "SPLIT_K": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..1c620ed90717 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=48,hidden_size=8192": { + "BLOCK_K": 64, + "SPLIT_K": 16, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..ca7f79c797da --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=48,hidden_size=9216": { + "BLOCK_K": 64, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..b18ad9ef740f --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=56,hidden_size=1024": { + "BLOCK_K": 32, + "SPLIT_K": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..7e5e16254b14 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=56,hidden_size=10240": { + "BLOCK_K": 64, + "SPLIT_K": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..66b9016b80d6 --- /dev/null +++ 
b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=56,hidden_size=102400": { + "BLOCK_K": 256, + "SPLIT_K": 4, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..41267d76bb7b --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=56,hidden_size=102656": { + "BLOCK_K": 128, + "SPLIT_K": 2, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..8b1bdd081b38 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=56,hidden_size=11008": { + "BLOCK_K": 64, + "SPLIT_K": 4, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..1b86365ec8d8 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=56,hidden_size=1152": { + "BLOCK_K": 32, + "SPLIT_K": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..a3605ec25c3e --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=56,hidden_size=128": { + "BLOCK_K": 128, + "SPLIT_K": 1, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..177e169046c1 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=56,hidden_size=1280": { + "BLOCK_K": 32, + "SPLIT_K": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..eefe22a77acf --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=56,hidden_size=128000": { + "BLOCK_K": 64, + "SPLIT_K": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json 
b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..6d4866d51277 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=56,hidden_size=128256": { + "BLOCK_K": 128, + "SPLIT_K": 4, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..d25a6e558ef0 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=56,hidden_size=13824": { + "BLOCK_K": 64, + "SPLIT_K": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..c13f346b0444 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=56,hidden_size=14336": { + "BLOCK_K": 64, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..30fefd4cff6f --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=56,hidden_size=1536": { + "BLOCK_K": 32, + "SPLIT_K": 16, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..d70b173c3ae9 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=56,hidden_size=15360": { + "BLOCK_K": 64, + "SPLIT_K": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..9686ba58b423 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=56,hidden_size=2048": { + "BLOCK_K": 64, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..ded89a74da24 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=56,hidden_size=22016": { + 
"BLOCK_K": 32, + "SPLIT_K": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..1bf7575e2a95 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=56,hidden_size=2304": { + "BLOCK_K": 32, + "SPLIT_K": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..ff3b1a65cd88 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=56,hidden_size=24576": { + "BLOCK_K": 32, + "SPLIT_K": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..32b1dc19f9a0 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=56,hidden_size=256": { + "BLOCK_K": 32, + "SPLIT_K": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..5a103600df73 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=56,hidden_size=2560": { + "BLOCK_K": 256, + "SPLIT_K": 2, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..862580187e9b --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=56,hidden_size=27392": { + "BLOCK_K": 64, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..765b7fcf0597 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=56,hidden_size=2752": { + "BLOCK_K": 32, + "SPLIT_K": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..dad3be5ede3c --- /dev/null +++ 
b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=56,hidden_size=27648": { + "BLOCK_K": 32, + "SPLIT_K": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..5e9de4977a0d --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=56,hidden_size=3072": { + "BLOCK_K": 128, + "SPLIT_K": 4, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..12c92e427c5d --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=56,hidden_size=32000": { + "BLOCK_K": 128, + "SPLIT_K": 4, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..cece35827652 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=56,hidden_size=32256": { + "BLOCK_K": 256, + "SPLIT_K": 2, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..05a07685f4f8 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=56,hidden_size=32512": { + "BLOCK_K": 64, + "SPLIT_K": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..19b832c59326 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=56,hidden_size=32768": { + "BLOCK_K": 64, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..13256a11de49 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=56,hidden_size=33024": { + "BLOCK_K": 256, + "SPLIT_K": 4, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json 
b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..71ca4b92a1c6 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=56,hidden_size=3328": { + "BLOCK_K": 64, + "SPLIT_K": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..7a02f9a5753f --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=56,hidden_size=3456": { + "BLOCK_K": 32, + "SPLIT_K": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..36ce27a20c7c --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=56,hidden_size=3584": { + "BLOCK_K": 32, + "SPLIT_K": 16, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..e473965d0d89 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=56,hidden_size=36864": { + "BLOCK_K": 128, + "SPLIT_K": 4, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..aa7ad780b5ed --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=56,hidden_size=4096": { + "BLOCK_K": 32, + "SPLIT_K": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..d09de854e154 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=56,hidden_size=43264": { + "BLOCK_K": 128, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..c600f9a4a564 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=56,hidden_size=4608": { + "BLOCK_K": 128, + 
"SPLIT_K": 4, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..f2fcd1b21602 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=56,hidden_size=49152": { + "BLOCK_K": 128, + "SPLIT_K": 2, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..a0f8e37807d3 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=56,hidden_size=512": { + "BLOCK_K": 32, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..9fbe3a6da66c --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=56,hidden_size=5120": { + "BLOCK_K": 32, + "SPLIT_K": 16, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..60d5c4dc40be --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=56,hidden_size=5504": { + "BLOCK_K": 256, + "SPLIT_K": 2, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..a36eb605fa4c --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=56,hidden_size=5632": { + "BLOCK_K": 64, + "SPLIT_K": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..ad7b11ec6f27 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=56,hidden_size=6144": { + "BLOCK_K": 64, + "SPLIT_K": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..528a7abd6ffd --- /dev/null +++ 
b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=56,hidden_size=6400": { + "BLOCK_K": 64, + "SPLIT_K": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..5e1779ffba43 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=56,hidden_size=64000": { + "BLOCK_K": 64, + "SPLIT_K": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..c89b534e12a5 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=56,hidden_size=64256": { + "BLOCK_K": 64, + "SPLIT_K": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..7943d0cfde5e --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=56,hidden_size=6848": { + "BLOCK_K": 64, + "SPLIT_K": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..06705f2aa342 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=56,hidden_size=6912": { + "BLOCK_K": 128, + "SPLIT_K": 4, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..21fcc4c55de4 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=56,hidden_size=7168": { + "BLOCK_K": 32, + "SPLIT_K": 16, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..c7b2c16677ae --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=56,hidden_size=8192": { + "BLOCK_K": 64, + "SPLIT_K": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json 
b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..03d554a6d65e --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=56,hidden_size=9216": { + "BLOCK_K": 256, + "SPLIT_K": 2, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..d0938a7af883 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=64,hidden_size=1024": { + "BLOCK_K": 32, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..07bcdbf20094 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=64,hidden_size=10240": { + "BLOCK_K": 64, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..a3b3e1a40bd6 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=64,hidden_size=102400": { + "BLOCK_K": 128, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..64d9dace43e6 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=64,hidden_size=102656": { + "BLOCK_K": 256, + "SPLIT_K": 2, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..5dfc981bc2f6 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=64,hidden_size=11008": { + "BLOCK_K": 128, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..bbefa2f6fdc6 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=64,hidden_size=1152": { + 
"BLOCK_K": 64, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..c6f10cf4989f --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=64,hidden_size=128": { + "BLOCK_K": 128, + "SPLIT_K": 1, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..4663289ea195 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=64,hidden_size=1280": { + "BLOCK_K": 64, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..4f50abb8852a --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=64,hidden_size=128000": { + "BLOCK_K": 128, + "SPLIT_K": 4, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..a3cc61173170 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=64,hidden_size=128256": { + "BLOCK_K": 128, + "SPLIT_K": 2, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..6a6d73ea7f2a --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=64,hidden_size=13824": { + "BLOCK_K": 256, + "SPLIT_K": 2, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..7ed41fea026e --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=64,hidden_size=14336": { + "BLOCK_K": 64, + "SPLIT_K": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..230edf27c64b --- 
/dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=64,hidden_size=1536": { + "BLOCK_K": 64, + "SPLIT_K": 32, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..cdd186cc4b9f --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=64,hidden_size=15360": { + "BLOCK_K": 64, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..4f76011c145a --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=64,hidden_size=2048": { + "BLOCK_K": 32, + "SPLIT_K": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..7798c41d0f06 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=64,hidden_size=22016": { + "BLOCK_K": 128, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..49d5d5dd3ac6 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=64,hidden_size=2304": { + "BLOCK_K": 64, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..9f700c6130b4 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=64,hidden_size=24576": { + "BLOCK_K": 128, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..4cc76fa38c81 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=64,hidden_size=256": { + "BLOCK_K": 32, + "SPLIT_K": 4, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json 
b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..76ee8a7d6102 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=64,hidden_size=2560": { + "BLOCK_K": 128, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..71b40355f9d8 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=64,hidden_size=27392": { + "BLOCK_K": 256, + "SPLIT_K": 2, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..7cdad6a26049 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=64,hidden_size=2752": { + "BLOCK_K": 32, + "SPLIT_K": 16, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..a17c9772d2bb --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=64,hidden_size=27648": { + "BLOCK_K": 128, + "SPLIT_K": 4, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..a6d8b40ac252 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=64,hidden_size=3072": { + "BLOCK_K": 32, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..bc0b651ecd4c --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=64,hidden_size=32000": { + "BLOCK_K": 64, + "SPLIT_K": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..c9c7dc3f4468 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=64,hidden_size=32256": { + 
"BLOCK_K": 128, + "SPLIT_K": 4, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..6892c863631b --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=64,hidden_size=32512": { + "BLOCK_K": 128, + "SPLIT_K": 2, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..e2f661dda26c --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=64,hidden_size=32768": { + "BLOCK_K": 64, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..7fa15ee16716 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=64,hidden_size=33024": { + "BLOCK_K": 256, + "SPLIT_K": 2, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..95868bcb7456 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=64,hidden_size=3328": { + "BLOCK_K": 64, + "SPLIT_K": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..4413b0d3675b --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=64,hidden_size=3456": { + "BLOCK_K": 256, + "SPLIT_K": 2, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..5899404b9634 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=64,hidden_size=3584": { + "BLOCK_K": 128, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..d89c80fb30b9 --- /dev/null +++ 
b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=64,hidden_size=36864": { + "BLOCK_K": 128, + "SPLIT_K": 4, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..76c2211485c3 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=64,hidden_size=4096": { + "BLOCK_K": 256, + "SPLIT_K": 2, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..b6f398c0b076 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=64,hidden_size=43264": { + "BLOCK_K": 128, + "SPLIT_K": 4, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..acec64f55cd6 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=64,hidden_size=4608": { + "BLOCK_K": 128, + "SPLIT_K": 2, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..7163d4ab8c39 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=64,hidden_size=49152": { + "BLOCK_K": 128, + "SPLIT_K": 2, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..0925d222a787 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=64,hidden_size=512": { + "BLOCK_K": 32, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..97ea276d1953 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=64,hidden_size=5120": { + "BLOCK_K": 128, + "SPLIT_K": 2, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json 
b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..d8383bc693fe --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=64,hidden_size=5504": { + "BLOCK_K": 64, + "SPLIT_K": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..7922df8c0829 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=64,hidden_size=5632": { + "BLOCK_K": 256, + "SPLIT_K": 2, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..fa5851fd7502 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=64,hidden_size=6144": { + "BLOCK_K": 256, + "SPLIT_K": 2, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..bd05c1814c89 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=64,hidden_size=6400": { + "BLOCK_K": 64, + "SPLIT_K": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..797f53636838 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=64,hidden_size=64000": { + "BLOCK_K": 64, + "SPLIT_K": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..3a1619a239c1 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=64,hidden_size=64256": { + "BLOCK_K": 64, + "SPLIT_K": 4, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..e953b5c67710 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=64,hidden_size=6848": { + "BLOCK_K": 64, + 
"SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..423d7b24f01d --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=64,hidden_size=6912": { + "BLOCK_K": 64, + "SPLIT_K": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..a7f10bd0d49f --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=64,hidden_size=7168": { + "BLOCK_K": 128, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..435dadeca1af --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=64,hidden_size=8192": { + "BLOCK_K": 64, + "SPLIT_K": 4, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..f346f15c2e23 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=64,hidden_size=9216": { + "BLOCK_K": 64, + "SPLIT_K": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..0c893f180a10 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=72,hidden_size=1024": { + "BLOCK_K": 32, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..1e4afde2e512 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=72,hidden_size=10240": { + "BLOCK_K": 64, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..978cd7ab7325 --- /dev/null +++ 
b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=72,hidden_size=102400": { + "BLOCK_K": 256, + "SPLIT_K": 2, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..c7bdb214bd4c --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=72,hidden_size=102656": { + "BLOCK_K": 64, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..6d6f21405c65 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=72,hidden_size=11008": { + "BLOCK_K": 32, + "SPLIT_K": 4, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..80cf30cfef96 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=72,hidden_size=1152": { + "BLOCK_K": 32, + "SPLIT_K": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..f8103e769b18 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=72,hidden_size=128": { + "BLOCK_K": 64, + "SPLIT_K": 1, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..c3d44a73f35b --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=72,hidden_size=1280": { + "BLOCK_K": 128, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..cae90f69a184 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=72,hidden_size=128000": { + "BLOCK_K": 128, + "SPLIT_K": 2, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json 
b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..bba9830ad8b8 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=72,hidden_size=128256": { + "BLOCK_K": 128, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..3107dffe8924 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=72,hidden_size=13824": { + "BLOCK_K": 64, + "SPLIT_K": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..a0e14bab1eaf --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=72,hidden_size=14336": { + "BLOCK_K": 64, + "SPLIT_K": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..5373e616c435 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=72,hidden_size=1536": { + "BLOCK_K": 32, + "SPLIT_K": 16, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..f891ab67fecf --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=72,hidden_size=15360": { + "BLOCK_K": 256, + "SPLIT_K": 2, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..b0a9bcbf635d --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=72,hidden_size=2048": { + "BLOCK_K": 256, + "SPLIT_K": 2, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..400122f387f5 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=72,hidden_size=22016": { + 
"BLOCK_K": 128, + "SPLIT_K": 2, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..6b0e09332d14 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=72,hidden_size=2304": { + "BLOCK_K": 128, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..7c718f80e1b4 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=72,hidden_size=24576": { + "BLOCK_K": 64, + "SPLIT_K": 4, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..109cc3a80846 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=72,hidden_size=256": { + "BLOCK_K": 32, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..66199975e837 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=72,hidden_size=2560": { + "BLOCK_K": 128, + "SPLIT_K": 2, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..e8654beb8e5d --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=72,hidden_size=27392": { + "BLOCK_K": 128, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..7f2de73261ab --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=72,hidden_size=2752": { + "BLOCK_K": 64, + "SPLIT_K": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..5d07e0d228b6 --- /dev/null +++ 
b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=72,hidden_size=27648": { + "BLOCK_K": 128, + "SPLIT_K": 2, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..08c31b8980d1 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=72,hidden_size=3072": { + "BLOCK_K": 64, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..3b025579accd --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=72,hidden_size=32000": { + "BLOCK_K": 128, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..70b96ee453df --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=72,hidden_size=32256": { + "BLOCK_K": 128, + "SPLIT_K": 4, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..844d7ed28415 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=72,hidden_size=32512": { + "BLOCK_K": 64, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..11201ec67bde --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=72,hidden_size=32768": { + "BLOCK_K": 64, + "SPLIT_K": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..e2ced87f84be --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=72,hidden_size=33024": { + "BLOCK_K": 128, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json 
b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..00c493a9d303 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=72,hidden_size=3328": { + "BLOCK_K": 128, + "SPLIT_K": 4, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..463c4846743a --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=72,hidden_size=3456": { + "BLOCK_K": 128, + "SPLIT_K": 4, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..c9798336f74f --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=72,hidden_size=3584": { + "BLOCK_K": 64, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..e6e44d69d91b --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=72,hidden_size=36864": { + "BLOCK_K": 64, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..bd730acc7051 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=72,hidden_size=4096": { + "BLOCK_K": 128, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..e13c0d95b195 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=72,hidden_size=43264": { + "BLOCK_K": 128, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..f1a4e393519c --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=72,hidden_size=4608": { + "BLOCK_K": 64, + 
"SPLIT_K": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..2ef93f449226 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=72,hidden_size=49152": { + "BLOCK_K": 128, + "SPLIT_K": 2, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..0004a05e8a1e --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=72,hidden_size=512": { + "BLOCK_K": 32, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..dc50b17d249d --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=72,hidden_size=5120": { + "BLOCK_K": 64, + "SPLIT_K": 4, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..bf12510a5425 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=72,hidden_size=5504": { + "BLOCK_K": 128, + "SPLIT_K": 4, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..2ac3724620af --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=72,hidden_size=5632": { + "BLOCK_K": 32, + "SPLIT_K": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..e4ab9dc8b8ba --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=72,hidden_size=6144": { + "BLOCK_K": 64, + "SPLIT_K": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..8383761bc837 --- /dev/null +++ 
b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=72,hidden_size=6400": { + "BLOCK_K": 64, + "SPLIT_K": 4, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..097ee2ec6574 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=72,hidden_size=64000": { + "BLOCK_K": 128, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..6ec204c9c111 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=72,hidden_size=64256": { + "BLOCK_K": 64, + "SPLIT_K": 4, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..847e461c0323 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=72,hidden_size=6848": { + "BLOCK_K": 64, + "SPLIT_K": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..e46f913737b3 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=72,hidden_size=6912": { + "BLOCK_K": 128, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..754a87f1bbe8 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=72,hidden_size=7168": { + "BLOCK_K": 32, + "SPLIT_K": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..83cb91d60ca7 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=72,hidden_size=8192": { + "BLOCK_K": 128, + "SPLIT_K": 4, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json 
b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..7081cf4076bb --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=72,hidden_size=9216": { + "BLOCK_K": 256, + "SPLIT_K": 2, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..247e28d35bd5 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=8,hidden_size=1024": { + "BLOCK_K": 32, + "SPLIT_K": 16, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..ea4f2a102c02 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=8,hidden_size=10240": { + "BLOCK_K": 128, + "SPLIT_K": 16, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..8ec9a4f020e6 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=8,hidden_size=102400": { + "BLOCK_K": 256, + "SPLIT_K": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..d7e734b9dbfe --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=8,hidden_size=102656": { + "BLOCK_K": 512, + "SPLIT_K": 16, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..daf1cb8ae2b0 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=8,hidden_size=11008": { + "BLOCK_K": 128, + "SPLIT_K": 128, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..002abc7662a5 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=8,hidden_size=1152": { + "BLOCK_K": 32, + 
"SPLIT_K": 16, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..92c8b78468f9 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=8,hidden_size=128": { + "BLOCK_K": 32, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..d901fce7e335 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=8,hidden_size=1280": { + "BLOCK_K": 32, + "SPLIT_K": 16, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..093e42b6513f --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=8,hidden_size=128000": { + "BLOCK_K": 128, + "SPLIT_K": 32, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..4443e8cbbf33 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=8,hidden_size=128256": { + "BLOCK_K": 128, + "SPLIT_K": 32, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..fc23807fa76e --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=8,hidden_size=13824": { + "BLOCK_K": 64, + "SPLIT_K": 32, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..ddbdd089d658 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=8,hidden_size=14336": { + "BLOCK_K": 32, + "SPLIT_K": 64, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..e35893b40242 --- /dev/null +++ 
b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=8,hidden_size=1536": { + "BLOCK_K": 32, + "SPLIT_K": 16, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..37acdda7a634 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=8,hidden_size=15360": { + "BLOCK_K": 32, + "SPLIT_K": 32, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..ab9441a7b3ce --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=8,hidden_size=2048": { + "BLOCK_K": 32, + "SPLIT_K": 32, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..59c73ba026de --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=8,hidden_size=22016": { + "BLOCK_K": 256, + "SPLIT_K": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..002f8b5c8968 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=8,hidden_size=2304": { + "BLOCK_K": 32, + "SPLIT_K": 32, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..4ae4d6f4734c --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=8,hidden_size=24576": { + "BLOCK_K": 64, + "SPLIT_K": 32, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..4e9bf0aba106 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=8,hidden_size=256": { + "BLOCK_K": 32, + "SPLIT_K": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json 
b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..1d191860b79c --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=8,hidden_size=2560": { + "BLOCK_K": 32, + "SPLIT_K": 32, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..23761e39d7ad --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=8,hidden_size=27392": { + "BLOCK_K": 64, + "SPLIT_K": 64, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..cc645e813758 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=8,hidden_size=2752": { + "BLOCK_K": 32, + "SPLIT_K": 32, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..ae6226d1cb21 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=8,hidden_size=27648": { + "BLOCK_K": 128, + "SPLIT_K": 32, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..88139aaaf02c --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=8,hidden_size=3072": { + "BLOCK_K": 32, + "SPLIT_K": 32, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..3f05658cfb57 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=8,hidden_size=32000": { + "BLOCK_K": 128, + "SPLIT_K": 16, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..4fbf55d8bb05 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=8,hidden_size=32256": { + "BLOCK_K": 128, + "SPLIT_K": 32, 
+ "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..4064cd359317 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=8,hidden_size=32512": { + "BLOCK_K": 256, + "SPLIT_K": 16, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..ce19b767a77d --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=8,hidden_size=32768": { + "BLOCK_K": 64, + "SPLIT_K": 32, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..d96efbb58943 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=8,hidden_size=33024": { + "BLOCK_K": 64, + "SPLIT_K": 16, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..8f68f4280c29 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=8,hidden_size=3328": { + "BLOCK_K": 32, + "SPLIT_K": 32, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..ec977c3530ae --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=8,hidden_size=3456": { + "BLOCK_K": 32, + "SPLIT_K": 32, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..7aa5634fcd48 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=8,hidden_size=3584": { + "BLOCK_K": 32, + "SPLIT_K": 32, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..b571892198ce --- /dev/null +++ 
b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=8,hidden_size=36864": { + "BLOCK_K": 32, + "SPLIT_K": 64, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..7bb972163a11 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=8,hidden_size=4096": { + "BLOCK_K": 32, + "SPLIT_K": 32, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..ae17c8ecf5e6 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=8,hidden_size=43264": { + "BLOCK_K": 64, + "SPLIT_K": 64, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..d1f32242519d --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=8,hidden_size=4608": { + "BLOCK_K": 32, + "SPLIT_K": 32, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..2a5dd4740ffb --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=8,hidden_size=49152": { + "BLOCK_K": 256, + "SPLIT_K": 64, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..dbf21f5fa1e0 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=8,hidden_size=512": { + "BLOCK_K": 32, + "SPLIT_K": 16, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..e920344f2420 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=8,hidden_size=5120": { + "BLOCK_K": 32, + "SPLIT_K": 32, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json 
b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..79940e1927b2 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=8,hidden_size=5504": { + "BLOCK_K": 32, + "SPLIT_K": 32, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..8a35fdbbafd7 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=8,hidden_size=5632": { + "BLOCK_K": 64, + "SPLIT_K": 32, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..c6e6a52180d3 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=8,hidden_size=6144": { + "BLOCK_K": 64, + "SPLIT_K": 32, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..e9d33ae6f038 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=8,hidden_size=6400": { + "BLOCK_K": 32, + "SPLIT_K": 32, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..be5e4ab7d032 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=8,hidden_size=64000": { + "BLOCK_K": 64, + "SPLIT_K": 64, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..711407b0620f --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=8,hidden_size=64256": { + "BLOCK_K": 128, + "SPLIT_K": 32, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..4f73d54aa992 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=8,hidden_size=6848": { + "BLOCK_K": 128, + "SPLIT_K": 32, + 
"num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..8c21ae9405a5 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=8,hidden_size=6912": { + "BLOCK_K": 128, + "SPLIT_K": 32, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..d5e3f555a677 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=8,hidden_size=7168": { + "BLOCK_K": 128, + "SPLIT_K": 32, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..68faf2a604da --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=8,hidden_size=8192": { + "BLOCK_K": 128, + "SPLIT_K": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..29f03a383aca --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=8,hidden_size=9216": { + "BLOCK_K": 128, + "SPLIT_K": 32, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..3acf2172ddb3 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=80,hidden_size=1024": { + "BLOCK_K": 32, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..643a627ea0d4 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=80,hidden_size=10240": { + "BLOCK_K": 32, + "SPLIT_K": 4, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..d99bce723687 --- /dev/null +++ 
b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=80,hidden_size=102400": { + "BLOCK_K": 256, + "SPLIT_K": 1, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..877a33b65222 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=80,hidden_size=102656": { + "BLOCK_K": 128, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..12ade6916fcb --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=80,hidden_size=11008": { + "BLOCK_K": 128, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..4dc0e71441f8 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=80,hidden_size=1152": { + "BLOCK_K": 32, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..f21a68e8ee83 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=80,hidden_size=128": { + "BLOCK_K": 128, + "SPLIT_K": 1, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..5152f0b3ff4e --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=80,hidden_size=1280": { + "BLOCK_K": 32, + "SPLIT_K": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..3953082729b0 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=80,hidden_size=128000": { + "BLOCK_K": 256, + "SPLIT_K": 1, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json 
b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..7524f10fca70 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=80,hidden_size=128256": { + "BLOCK_K": 256, + "SPLIT_K": 1, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..2064e9bd9b5b --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=80,hidden_size=13824": { + "BLOCK_K": 64, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..f5348113634c --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=80,hidden_size=14336": { + "BLOCK_K": 256, + "SPLIT_K": 2, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..0dfe95f6c31b --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=80,hidden_size=1536": { + "BLOCK_K": 128, + "SPLIT_K": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..6d25ff48d801 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=80,hidden_size=15360": { + "BLOCK_K": 128, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..6f86b7098d3e --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=80,hidden_size=2048": { + "BLOCK_K": 64, + "SPLIT_K": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..ac0d8fe29ee7 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=80,hidden_size=22016": { + 
"BLOCK_K": 64, + "SPLIT_K": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..eebb376f205b --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=80,hidden_size=2304": { + "BLOCK_K": 64, + "SPLIT_K": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..1e45d954518c --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=80,hidden_size=24576": { + "BLOCK_K": 64, + "SPLIT_K": 4, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..702d10096436 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=80,hidden_size=256": { + "BLOCK_K": 32, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..e89e84d4deed --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=80,hidden_size=2560": { + "BLOCK_K": 32, + "SPLIT_K": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..78683762005e --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=80,hidden_size=27392": { + "BLOCK_K": 256, + "SPLIT_K": 2, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..0da685e67d6d --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=80,hidden_size=2752": { + "BLOCK_K": 128, + "SPLIT_K": 4, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..53017d8ee495 --- /dev/null +++ 
b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=80,hidden_size=27648": { + "BLOCK_K": 64, + "SPLIT_K": 4, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..7c05ddd4194c --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=80,hidden_size=3072": { + "BLOCK_K": 128, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..ebd7f65eac4a --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=80,hidden_size=32000": { + "BLOCK_K": 64, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..3541f0b9a3f0 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=80,hidden_size=32256": { + "BLOCK_K": 64, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..20d55b196608 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=80,hidden_size=32512": { + "BLOCK_K": 128, + "SPLIT_K": 4, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..96210fec220f --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=80,hidden_size=32768": { + "BLOCK_K": 64, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..125b434ef45b --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=80,hidden_size=33024": { + "BLOCK_K": 64, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json 
b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..b04fbbdda9de --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=80,hidden_size=3328": { + "BLOCK_K": 32, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..ce1b418a958c --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=80,hidden_size=3456": { + "BLOCK_K": 32, + "SPLIT_K": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..ea1c5a006c8f --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=80,hidden_size=3584": { + "BLOCK_K": 64, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..40ec1fab7bba --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=80,hidden_size=36864": { + "BLOCK_K": 64, + "SPLIT_K": 2, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..4c361c17fe59 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=80,hidden_size=4096": { + "BLOCK_K": 128, + "SPLIT_K": 4, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..0dd7c3d37d70 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=80,hidden_size=43264": { + "BLOCK_K": 128, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..8723dad79e62 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=80,hidden_size=4608": { + "BLOCK_K": 64, + 
"SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..9415037ed7f7 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=80,hidden_size=49152": { + "BLOCK_K": 64, + "SPLIT_K": 2, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..13eb0b2756ae --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=80,hidden_size=512": { + "BLOCK_K": 32, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..dc583685ab02 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=80,hidden_size=5120": { + "BLOCK_K": 128, + "SPLIT_K": 4, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..23fb68ff8153 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=80,hidden_size=5504": { + "BLOCK_K": 128, + "SPLIT_K": 4, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..b085e5316888 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=80,hidden_size=5632": { + "BLOCK_K": 128, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..812297caf2a0 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=80,hidden_size=6144": { + "BLOCK_K": 64, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..54d92a85b1f6 --- /dev/null +++ 
b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=80,hidden_size=6400": { + "BLOCK_K": 128, + "SPLIT_K": 4, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..405c2e868728 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=80,hidden_size=64000": { + "BLOCK_K": 64, + "SPLIT_K": 2, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..c66a7e5f0ccf --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=80,hidden_size=64256": { + "BLOCK_K": 128, + "SPLIT_K": 2, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..6747ed1a08c7 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=80,hidden_size=6848": { + "BLOCK_K": 64, + "SPLIT_K": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..31651fc1faa2 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=80,hidden_size=6912": { + "BLOCK_K": 128, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..ed2fa7ce6d18 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=80,hidden_size=7168": { + "BLOCK_K": 128, + "SPLIT_K": 4, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..b5546e695dc9 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=80,hidden_size=8192": { + "BLOCK_K": 256, + "SPLIT_K": 2, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json 
b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..7c44a9ae81b9 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=80,hidden_size=9216": { + "BLOCK_K": 64, + "SPLIT_K": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..f39f6414abd3 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=88,hidden_size=1024": { + "BLOCK_K": 64, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..841902cc2ff0 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=88,hidden_size=10240": { + "BLOCK_K": 64, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..f7aa2b2768cb --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=88,hidden_size=102400": { + "BLOCK_K": 128, + "SPLIT_K": 2, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..07e42e00e844 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=88,hidden_size=102656": { + "BLOCK_K": 256, + "SPLIT_K": 2, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..830764450db5 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=88,hidden_size=11008": { + "BLOCK_K": 32, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..3a5efc527c8b --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=88,hidden_size=1152": { + 
"BLOCK_K": 32, + "SPLIT_K": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..becb0e603976 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=88,hidden_size=128": { + "BLOCK_K": 64, + "SPLIT_K": 1, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..b44b8c3d8180 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=88,hidden_size=1280": { + "BLOCK_K": 32, + "SPLIT_K": 16, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..ddfcfb01c7b7 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=88,hidden_size=128000": { + "BLOCK_K": 128, + "SPLIT_K": 1, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..6719e38fba98 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=88,hidden_size=128256": { + "BLOCK_K": 128, + "SPLIT_K": 1, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..0f52b8c9cc0f --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=88,hidden_size=13824": { + "BLOCK_K": 128, + "SPLIT_K": 4, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..9cc14f02017d --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=88,hidden_size=14336": { + "BLOCK_K": 32, + "SPLIT_K": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..c50b3242921a --- 
/dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=88,hidden_size=1536": { + "BLOCK_K": 64, + "SPLIT_K": 4, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..7e98f105086a --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=88,hidden_size=15360": { + "BLOCK_K": 64, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..aa6bee2870fc --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=88,hidden_size=2048": { + "BLOCK_K": 128, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..21096640cb13 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=88,hidden_size=22016": { + "BLOCK_K": 128, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..a50aa2e0363c --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=88,hidden_size=2304": { + "BLOCK_K": 32, + "SPLIT_K": 4, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..c3364686564e --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=88,hidden_size=24576": { + "BLOCK_K": 64, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..b7ff93819113 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=88,hidden_size=256": { + "BLOCK_K": 32, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json 
b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..8d10bf69031c --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=88,hidden_size=2560": { + "BLOCK_K": 128, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..096b8d320b72 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=88,hidden_size=27392": { + "BLOCK_K": 256, + "SPLIT_K": 2, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..684d38d2811c --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=88,hidden_size=2752": { + "BLOCK_K": 64, + "SPLIT_K": 2, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..2bd01194d5ce --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=88,hidden_size=27648": { + "BLOCK_K": 64, + "SPLIT_K": 4, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..a7d00956f02c --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=88,hidden_size=3072": { + "BLOCK_K": 128, + "SPLIT_K": 4, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..886d8ea5f4ab --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=88,hidden_size=32000": { + "BLOCK_K": 128, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..d8e6b13dbe92 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=88,hidden_size=32256": { + 
"BLOCK_K": 64, + "SPLIT_K": 4, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..c62e742fa961 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=88,hidden_size=32512": { + "BLOCK_K": 64, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..e1d739d0a49d --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=88,hidden_size=32768": { + "BLOCK_K": 64, + "SPLIT_K": 4, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..601d128de45a --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=88,hidden_size=33024": { + "BLOCK_K": 64, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..5dd67e2690f9 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=88,hidden_size=3328": { + "BLOCK_K": 64, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..b7d9fe07cfb5 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=88,hidden_size=3456": { + "BLOCK_K": 64, + "SPLIT_K": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..ca7f97699ec8 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=88,hidden_size=3584": { + "BLOCK_K": 64, + "SPLIT_K": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..005f4af2dd66 --- /dev/null +++ 
b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=88,hidden_size=36864": { + "BLOCK_K": 128, + "SPLIT_K": 2, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..b97db9b80ccc --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=88,hidden_size=4096": { + "BLOCK_K": 64, + "SPLIT_K": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..fab5f7de4715 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=88,hidden_size=43264": { + "BLOCK_K": 256, + "SPLIT_K": 2, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..78ca804fa3d1 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=88,hidden_size=4608": { + "BLOCK_K": 64, + "SPLIT_K": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..524c1118598f --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=88,hidden_size=49152": { + "BLOCK_K": 128, + "SPLIT_K": 2, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..387b87a065a8 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=88,hidden_size=512": { + "BLOCK_K": 32, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..34ce46ce03d2 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=88,hidden_size=5120": { + "BLOCK_K": 256, + "SPLIT_K": 2, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json 
b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..148080894721 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=88,hidden_size=5504": { + "BLOCK_K": 128, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..c75811679466 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=88,hidden_size=5632": { + "BLOCK_K": 64, + "SPLIT_K": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..fdc0c3cfa0dd --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=88,hidden_size=6144": { + "BLOCK_K": 128, + "SPLIT_K": 4, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..c86867594102 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=88,hidden_size=6400": { + "BLOCK_K": 128, + "SPLIT_K": 2, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..8e4ca4b8d8a1 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=88,hidden_size=64000": { + "BLOCK_K": 256, + "SPLIT_K": 2, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..206e6e2d37e4 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=88,hidden_size=64256": { + "BLOCK_K": 128, + "SPLIT_K": 2, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..ab33b78848ec --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=88,hidden_size=6848": { + "BLOCK_K": 128, 
+ "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..e06af8cd8cfd --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=88,hidden_size=6912": { + "BLOCK_K": 128, + "SPLIT_K": 4, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..b775ea143b36 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=88,hidden_size=7168": { + "BLOCK_K": 64, + "SPLIT_K": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..9300b4bdb8f1 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=88,hidden_size=8192": { + "BLOCK_K": 64, + "SPLIT_K": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..3c604544d052 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=88,hidden_size=9216": { + "BLOCK_K": 32, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..068b851caa7f --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=96,hidden_size=1024": { + "BLOCK_K": 32, + "SPLIT_K": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..378704ab28cf --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=96,hidden_size=10240": { + "BLOCK_K": 64, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..90174392ce3e --- /dev/null +++ 
b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=96,hidden_size=102400": { + "BLOCK_K": 256, + "SPLIT_K": 2, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..112850a0b030 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=96,hidden_size=102656": { + "BLOCK_K": 128, + "SPLIT_K": 1, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..77f512f41fb9 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=96,hidden_size=11008": { + "BLOCK_K": 128, + "SPLIT_K": 4, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..1f3f1b604a4d --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=96,hidden_size=1152": { + "BLOCK_K": 64, + "SPLIT_K": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..3124fa86ce0a --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=96,hidden_size=128": { + "BLOCK_K": 128, + "SPLIT_K": 1, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..fea9bf5bd3a5 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=96,hidden_size=1280": { + "BLOCK_K": 64, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..df330fd200a9 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=96,hidden_size=128000": { + "BLOCK_K": 128, + "SPLIT_K": 2, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json 
b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..3753f062cd04 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=96,hidden_size=128256": { + "BLOCK_K": 128, + "SPLIT_K": 2, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..5aee58b4062e --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=96,hidden_size=13824": { + "BLOCK_K": 64, + "SPLIT_K": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..2db120babc1f --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=96,hidden_size=14336": { + "BLOCK_K": 64, + "SPLIT_K": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..11a26c11166c --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=96,hidden_size=1536": { + "BLOCK_K": 64, + "SPLIT_K": 32, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..100067e93785 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=96,hidden_size=15360": { + "BLOCK_K": 128, + "SPLIT_K": 4, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..5d9db82e5288 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=96,hidden_size=2048": { + "BLOCK_K": 32, + "SPLIT_K": 4, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..abc9c95e8b22 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=96,hidden_size=22016": { + 
"BLOCK_K": 128, + "SPLIT_K": 4, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..66be2017f0ae --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=96,hidden_size=2304": { + "BLOCK_K": 32, + "SPLIT_K": 4, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..45873b198e73 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=96,hidden_size=24576": { + "BLOCK_K": 64, + "SPLIT_K": 2, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..ff32d7268f4e --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=96,hidden_size=256": { + "BLOCK_K": 32, + "SPLIT_K": 2, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..c404185f47ef --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=96,hidden_size=2560": { + "BLOCK_K": 32, + "SPLIT_K": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..45dc37cd6c1e --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=96,hidden_size=27392": { + "BLOCK_K": 128, + "SPLIT_K": 4, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..354f9cda513e --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=96,hidden_size=2752": { + "BLOCK_K": 64, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..3ce20813a940 --- /dev/null +++ 
b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=96,hidden_size=27648": { + "BLOCK_K": 64, + "SPLIT_K": 2, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..fcec90b796f8 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=96,hidden_size=3072": { + "BLOCK_K": 128, + "SPLIT_K": 4, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..06035f8733b3 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=96,hidden_size=32000": { + "BLOCK_K": 64, + "SPLIT_K": 2, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..c5c04329190b --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=96,hidden_size=32256": { + "BLOCK_K": 64, + "SPLIT_K": 4, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..89bdb176ebc2 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=96,hidden_size=32512": { + "BLOCK_K": 64, + "SPLIT_K": 4, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..faab7a47840a --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=96,hidden_size=32768": { + "BLOCK_K": 128, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..5b56d69c403f --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=96,hidden_size=33024": { + "BLOCK_K": 256, + "SPLIT_K": 2, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json 
b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..4b0e1c5badfa --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=96,hidden_size=3328": { + "BLOCK_K": 64, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..600943e1897d --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=96,hidden_size=3456": { + "BLOCK_K": 32, + "SPLIT_K": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..6690b75db842 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=96,hidden_size=3584": { + "BLOCK_K": 64, + "SPLIT_K": 4, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..e584bc28dd7d --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=96,hidden_size=36864": { + "BLOCK_K": 256, + "SPLIT_K": 2, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..7b09b5d1a65c --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=96,hidden_size=4096": { + "BLOCK_K": 32, + "SPLIT_K": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..98d8cebcbf73 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=96,hidden_size=43264": { + "BLOCK_K": 64, + "SPLIT_K": 2, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..c63bbc7f882d --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=96,hidden_size=4608": { + "BLOCK_K": 128, + 
"SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..a36a9b36aa45 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=96,hidden_size=49152": { + "BLOCK_K": 64, + "SPLIT_K": 2, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..d24898f0d4c4 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=96,hidden_size=512": { + "BLOCK_K": 32, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..56f8409d1aee --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=96,hidden_size=5120": { + "BLOCK_K": 32, + "SPLIT_K": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..589965340a56 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=96,hidden_size=5504": { + "BLOCK_K": 64, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..cf5e15814824 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=96,hidden_size=5632": { + "BLOCK_K": 128, + "SPLIT_K": 4, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..2034cf5ea634 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=96,hidden_size=6144": { + "BLOCK_K": 128, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..a2fb7b122395 --- /dev/null +++ 
b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=96,hidden_size=6400": { + "BLOCK_K": 128, + "SPLIT_K": 2, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..3128d88fe9bc --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=96,hidden_size=64000": { + "BLOCK_K": 128, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..751604b796cd --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=96,hidden_size=64256": { + "BLOCK_K": 128, + "SPLIT_K": 2, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..c5d3fc706dc8 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=96,hidden_size=6848": { + "BLOCK_K": 128, + "SPLIT_K": 4, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..26748fd2b2b1 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=96,hidden_size=6912": { + "BLOCK_K": 64, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..8b0abb8d3cb2 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=96,hidden_size=7168": { + "BLOCK_K": 128, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..46756b2b589a --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=96,hidden_size=8192": { + "BLOCK_K": 64, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json 
b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..6dd33a999d46 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=96,hidden_size=9216": { + "BLOCK_K": 64, + "SPLIT_K": 4, + "num_warps": 4 + } +}
diff --git a/vllm/lora/ops/utils.py b/vllm/lora/ops/utils.py
index fb8110b90564..f4e71cb110bb 100644
--- a/vllm/lora/ops/utils.py
+++ b/vllm/lora/ops/utils.py
@@ -35,7 +35,7 @@ def _get_op_configs(
         f"batchs={batch},hidden_size={hidden_size}", None
     )
     return tuned_config
-
+    # If no optimized configuration is available, return None
     return None
@@ -45,7 +45,6 @@ def _get_default_config(op_type: str, batch: int, hidden_size: int):
         return {"BLOCK_N": 256, "SPLIT_N": 8, "num_warps": 8}
     else:
         return {"BLOCK_K": 32, "SPLIT_K": 64, "num_warps": 8}
-    # raise NotImplementedError
 
 
 def get_lora_op_configs(
From e7bda61b8181ef3dec0f123fa07d7ae92aeef639 Mon Sep 17 00:00:00 2001
From: jeejeeli
Date: Thu, 13 Jun 2024 14:40:26 +0800
Subject: [PATCH 21/71] delete config

---
 ...dden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...den_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...en_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...en_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...den_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...dden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...idden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...dden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...en_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...en_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...den_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...den_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...dden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...den_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...dden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...den_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...dden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...den_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...idden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...dden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...den_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...dden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...den_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...dden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...den_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...den_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...den_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...den_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...den_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...dden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...dden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...dden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...den_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...dden_size=4096
From e7bda61b8181ef3dec0f123fa07d7ae92aeef639 Mon Sep 17 00:00:00 2001
From: jeejeeli
Date: Thu, 13 Jun 2024 14:40:26 +0800
Subject: [PATCH 21/71] delete config

---
 ...dden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json | 7 -------
 ...den_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json | 7 -------
 ...en_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json | 7 -------
 ...en_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json | 7 -------
 ...den_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json | 7 -------
 ...dden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json | 7 -------
 ...idden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json | 7 -------
 ...dden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json | 7 -------
 ...en_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json | 7 -------
 ...en_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json | 7 -------
 ...den_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json | 7 -------
 ...den_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json | 7 -------
 ...dden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json | 7 -------
 ...den_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json | 7 -------
 ...dden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json | 7 -------
 ...den_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json | 7 -------
 ...dden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json | 7 -------
 ...den_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json | 7 -------
 ...idden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json | 7 -------
 ...dden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json | 7 -------
 ...den_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json | 7 -------
 ...dden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json | 7 -------
 ...den_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json | 7 -------
 ...dden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json | 7 -------
 ...den_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json | 7 -------
 ...den_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json | 7 -------
 ...den_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json | 7 -------
 ...den_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json | 7 -------
 ...den_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json | 7 -------
 ...dden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json | 7 -------
 ...dden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json | 7 -------
 ...dden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json | 7 -------
 ...den_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json | 7 -------
 ...dden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json | 7 -------
 ...den_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json | 7 -------
 ...dden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json | 7 -------
 ...den_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json | 7 -------
 ...idden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json | 7 -------
 ...dden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json | 7 -------
 ...dden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json | 7 -------
 ...dden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json | 7 -------
 ...dden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json | 7 -------
 ...dden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json | 7 -------
 ...den_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json | 7 -------
 ...den_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json | 7 -------
 ...dden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json | 7 -------
 ...dden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json | 7 -------
 ...dden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json | 7 -------
 ...dden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json | 7 -------
 ...dden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json | 7 -------
device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...den_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...dden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...den_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...idden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...dden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...dden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...dden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...dden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...dden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...den_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...den_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...dden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...dden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...dden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...dden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...dden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...dden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...den_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...en_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...en_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...den_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...dden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...idden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...dden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...en_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...en_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...den_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...den_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...dden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...den_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...dden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...den_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...dden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...den_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...idden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...dden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...den_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...dden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...den_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...dden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...den_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...den_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...den_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...den_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...den_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...dden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...dden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...dden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...den_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...dden_size=4096 
device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...den_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...dden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...den_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...idden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...dden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...dden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...dden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...dden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...dden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...den_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...den_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...dden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...dden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...dden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...dden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...dden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...dden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...den_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...en_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...en_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...den_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...dden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...idden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...dden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...en_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...en_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...den_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...den_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...dden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...den_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...dden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...den_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...dden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...den_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...idden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...dden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...den_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...dden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...den_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...dden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...den_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...den_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...den_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...den_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...den_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...dden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...dden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...dden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...den_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...dden_size=4096 
device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...den_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...dden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...den_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...idden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...dden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...dden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...dden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...dden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...dden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...den_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...den_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...dden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...dden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...dden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...dden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...dden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...dden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...den_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...en_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...en_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...den_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...dden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...idden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...dden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...en_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...en_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...den_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...den_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...dden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...den_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...dden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...den_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...dden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...den_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...idden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...dden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...den_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...dden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...den_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...dden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...den_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...den_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...den_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...den_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...den_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...dden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...dden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...dden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...den_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...dden_size=4096 
device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...den_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...dden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...den_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...idden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...dden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...dden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...dden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...dden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...dden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...den_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...den_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...dden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...dden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...dden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...dden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...dden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json | 7 -------
1150 files changed, 8050 deletions(-)
delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json delete
mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=6848 
device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 
vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 
vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=32768 
device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 
vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 
vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 
vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 
vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=15360 
device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 
vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=256 
device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 
vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=3072 
device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 
vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 
vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 
vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 
vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=128256 
device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 
vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=2048 
device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 
vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 
vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 
vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=3072 
device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 
vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 
vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 
vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 
vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 
vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 
vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 
vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 
vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 
vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 
vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=2752 
device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 
vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 
vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 
vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 
vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 
vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 
vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 3b39ab85d9b3..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=1,hidden_size=1024": { - "BLOCK_N": 32, - "SPLIT_N": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 926c453330ce..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=1,hidden_size=10240": { - "BLOCK_N": 64, - "SPLIT_N": 256, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 3403b6a8a156..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ 
-{ - "batchs=1,hidden_size=102400": { - "BLOCK_N": 64, - "SPLIT_N": 256, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 00a40f0fb282..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=1,hidden_size=102656": { - "BLOCK_N": 64, - "SPLIT_N": 256, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index fe2ef151f545..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=1,hidden_size=11008": { - "BLOCK_N": 64, - "SPLIT_N": 256, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index e72812a699b8..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=1,hidden_size=1152": { - "BLOCK_N": 32, - "SPLIT_N": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 6af79154d137..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=1,hidden_size=128": { - "BLOCK_N": 32, - "SPLIT_N": 16, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index ede75bf5ee4a..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=1,hidden_size=1280": { - "BLOCK_N": 32, - "SPLIT_N": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 7c18b5d9e89a..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=1,hidden_size=128000": { - "BLOCK_N": 256, - "SPLIT_N": 256, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 
100644 index 74123059d34c..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=1,hidden_size=128256": { - "BLOCK_N": 256, - "SPLIT_N": 256, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index e6204367ba9f..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=1,hidden_size=13824": { - "BLOCK_N": 64, - "SPLIT_N": 256, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 619d49755fbd..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=1,hidden_size=14336": { - "BLOCK_N": 64, - "SPLIT_N": 256, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index da168958d44b..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=1,hidden_size=1536": { - "BLOCK_N": 32, - "SPLIT_N": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index b774e5e73509..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=1,hidden_size=15360": { - "BLOCK_N": 32, - "SPLIT_N": 256, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 5df0d12a0066..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=1,hidden_size=2048": { - "BLOCK_N": 32, - "SPLIT_N": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index e669eec80db8..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=1,hidden_size=22016": { - "BLOCK_N": 32, - "SPLIT_N": 256, - "num_warps": 8 - } -} diff --git 
a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 6f248613276a..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=1,hidden_size=2304": { - "BLOCK_N": 32, - "SPLIT_N": 128, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 4b800fceca15..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=1,hidden_size=24576": { - "BLOCK_N": 32, - "SPLIT_N": 256, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 97ef5bd49850..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=1,hidden_size=256": { - "BLOCK_N": 32, - "SPLIT_N": 32, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 498985dfa565..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=1,hidden_size=2560": { - "BLOCK_N": 32, - "SPLIT_N": 128, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 888779c1a242..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=1,hidden_size=27392": { - "BLOCK_N": 128, - "SPLIT_N": 256, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 90c40f66516a..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=1,hidden_size=2752": { - "BLOCK_N": 64, - "SPLIT_N": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 725987ef135d..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=27648 
device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=1,hidden_size=27648": { - "BLOCK_N": 128, - "SPLIT_N": 256, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 6758c49d6d53..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=1,hidden_size=3072": { - "BLOCK_N": 32, - "SPLIT_N": 128, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 739073148751..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=1,hidden_size=32000": { - "BLOCK_N": 128, - "SPLIT_N": 256, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 3ba9089734f5..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=1,hidden_size=32256": { - "BLOCK_N": 64, - "SPLIT_N": 256, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 74d73ee28866..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=1,hidden_size=32512": { - "BLOCK_N": 128, - "SPLIT_N": 256, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index cd011852520e..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=1,hidden_size=32768": { - "BLOCK_N": 64, - "SPLIT_N": 256, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index bdb74bae1096..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=1,hidden_size=33024": { - "BLOCK_N": 64, - "SPLIT_N": 256, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json 
b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 90067aae86ea..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=1,hidden_size=3328": { - "BLOCK_N": 32, - "SPLIT_N": 128, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 266f0b4643a2..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=1,hidden_size=3456": { - "BLOCK_N": 32, - "SPLIT_N": 128, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 50eef61c7dc0..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=1,hidden_size=3584": { - "BLOCK_N": 32, - "SPLIT_N": 128, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 1a0e6bad928a..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=1,hidden_size=36864": { - "BLOCK_N": 32, - "SPLIT_N": 256, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 6379489182c5..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=1,hidden_size=4096": { - "BLOCK_N": 32, - "SPLIT_N": 128, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 845e90469c7a..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=1,hidden_size=43264": { - "BLOCK_N": 64, - "SPLIT_N": 256, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 6da3f2cdd17e..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=1,hidden_size=4608": { - 
"BLOCK_N": 32, - "SPLIT_N": 256, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 5859f692c3b2..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=1,hidden_size=49152": { - "BLOCK_N": 64, - "SPLIT_N": 256, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index a1ed01126386..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=1,hidden_size=512": { - "BLOCK_N": 32, - "SPLIT_N": 32, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 20e62377ef27..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=1,hidden_size=5120": { - "BLOCK_N": 32, - "SPLIT_N": 256, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index cfd9a3f149ff..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=1,hidden_size=5504": { - "BLOCK_N": 64, - "SPLIT_N": 128, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 7e7ff82dc5f3..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=1,hidden_size=5632": { - "BLOCK_N": 32, - "SPLIT_N": 256, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 6312c21225d5..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=1,hidden_size=6144": { - "BLOCK_N": 32, - "SPLIT_N": 256, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 7bda71f1c3e4..000000000000 --- 
a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=1,hidden_size=6400": { - "BLOCK_N": 32, - "SPLIT_N": 256, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index c39485cbc08c..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=1,hidden_size=64000": { - "BLOCK_N": 64, - "SPLIT_N": 256, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index b1adfbe01c2c..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=1,hidden_size=64256": { - "BLOCK_N": 64, - "SPLIT_N": 256, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 0b1aee061aa7..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=1,hidden_size=6848": { - "BLOCK_N": 128, - "SPLIT_N": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 3a894b412e3f..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=1,hidden_size=6912": { - "BLOCK_N": 32, - "SPLIT_N": 256, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 0132c4375421..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=1,hidden_size=7168": { - "BLOCK_N": 32, - "SPLIT_N": 256, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 9ffe008aa83e..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=1,hidden_size=8192": { - "BLOCK_N": 32, - "SPLIT_N": 256, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json 
b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 838189dba35d..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=1,hidden_size=9216": { - "BLOCK_N": 64, - "SPLIT_N": 256, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 5b1da4d44b94..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=16,hidden_size=1024": { - "BLOCK_N": 64, - "SPLIT_N": 16, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index c392909217f5..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=16,hidden_size=10240": { - "BLOCK_N": 32, - "SPLIT_N": 16, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 2e160c4ae390..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=16,hidden_size=102400": { - "BLOCK_N": 128, - "SPLIT_N": 16, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 46428cc0a9da..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=16,hidden_size=102656": { - "BLOCK_N": 512, - "SPLIT_N": 32, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 34ff5ebb9fe7..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=16,hidden_size=11008": { - "BLOCK_N": 128, - "SPLIT_N": 16, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 1375324c09ad..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - 
"batchs=16,hidden_size=1152": { - "BLOCK_N": 64, - "SPLIT_N": 32, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index e8b0e9dbe8b0..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=16,hidden_size=128": { - "BLOCK_N": 32, - "SPLIT_N": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 72fc4afd1efe..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=16,hidden_size=1280": { - "BLOCK_N": 32, - "SPLIT_N": 16, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 97e7d9e7bd0d..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=16,hidden_size=128000": { - "BLOCK_N": 1024, - "SPLIT_N": 16, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 40a4a9526be0..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=16,hidden_size=128256": { - "BLOCK_N": 128, - "SPLIT_N": 8, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index ac35eea6297a..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=16,hidden_size=13824": { - "BLOCK_N": 32, - "SPLIT_N": 32, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 38b1819b0120..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=16,hidden_size=14336": { - "BLOCK_N": 128, - "SPLIT_N": 8, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json 
deleted file mode 100644 index 668669e9fb4a..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=16,hidden_size=1536": { - "BLOCK_N": 64, - "SPLIT_N": 32, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index c23e4b555ab4..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=16,hidden_size=15360": { - "BLOCK_N": 512, - "SPLIT_N": 32, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 30715168cdd8..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=16,hidden_size=2048": { - "BLOCK_N": 64, - "SPLIT_N": 16, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 050f3384e1cf..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=16,hidden_size=22016": { - "BLOCK_N": 256, - "SPLIT_N": 16, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 3820959d0032..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=16,hidden_size=2304": { - "BLOCK_N": 32, - "SPLIT_N": 16, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 08d8f70e1e7a..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=16,hidden_size=24576": { - "BLOCK_N": 128, - "SPLIT_N": 64, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 5cf06550f0b1..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=16,hidden_size=256": { - "BLOCK_N": 32, - "SPLIT_N": 8, - "num_warps": 8 - } -} diff --git 
a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 5ee401212495..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=16,hidden_size=2560": { - "BLOCK_N": 32, - "SPLIT_N": 32, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 1d7db0c6a860..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=16,hidden_size=27392": { - "BLOCK_N": 512, - "SPLIT_N": 32, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 77fc2358208c..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=16,hidden_size=2752": { - "BLOCK_N": 32, - "SPLIT_N": 32, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 3b33817c6ecb..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=16,hidden_size=27648": { - "BLOCK_N": 256, - "SPLIT_N": 8, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index d62a622342b5..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=16,hidden_size=3072": { - "BLOCK_N": 32, - "SPLIT_N": 32, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index c7030ad5a673..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=16,hidden_size=32000": { - "BLOCK_N": 64, - "SPLIT_N": 16, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index ff76f3c110b9..000000000000 --- 
a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=16,hidden_size=32256": { - "BLOCK_N": 512, - "SPLIT_N": 16, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 721b587a948d..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=16,hidden_size=32512": { - "BLOCK_N": 256, - "SPLIT_N": 8, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 92932b62f1a1..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=16,hidden_size=32768": { - "BLOCK_N": 256, - "SPLIT_N": 32, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 7cf1394d96bd..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=16,hidden_size=33024": { - "BLOCK_N": 128, - "SPLIT_N": 16, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index bec43f2e9cd1..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=16,hidden_size=3328": { - "BLOCK_N": 32, - "SPLIT_N": 64, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 8dfd12024faa..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=16,hidden_size=3456": { - "BLOCK_N": 32, - "SPLIT_N": 32, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index c9fa0757f4d2..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=16,hidden_size=3584": { - "BLOCK_N": 32, - "SPLIT_N": 32, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=36864 
device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 2323a50dfb84..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=16,hidden_size=36864": { - "BLOCK_N": 128, - "SPLIT_N": 32, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 41e170807720..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=16,hidden_size=4096": { - "BLOCK_N": 64, - "SPLIT_N": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index b04da877902c..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=16,hidden_size=43264": { - "BLOCK_N": 128, - "SPLIT_N": 32, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 0df3ef025f97..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=16,hidden_size=4608": { - "BLOCK_N": 128, - "SPLIT_N": 8, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 07e41596ed86..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=16,hidden_size=49152": { - "BLOCK_N": 128, - "SPLIT_N": 32, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 9013302be01a..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=16,hidden_size=512": { - "BLOCK_N": 32, - "SPLIT_N": 16, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 520a85f2e70a..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null 
@@ -1,7 +0,0 @@ -{ - "batchs=16,hidden_size=5120": { - "BLOCK_N": 256, - "SPLIT_N": 16, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 11a3940a9d4a..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=16,hidden_size=5504": { - "BLOCK_N": 64, - "SPLIT_N": 128, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 43f4baa91a71..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=16,hidden_size=5632": { - "BLOCK_N": 32, - "SPLIT_N": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 2a5260ec1d4d..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=16,hidden_size=6144": { - "BLOCK_N": 32, - "SPLIT_N": 128, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 962399539ec2..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=16,hidden_size=6400": { - "BLOCK_N": 128, - "SPLIT_N": 32, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index cbc8f93ce329..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=16,hidden_size=64000": { - "BLOCK_N": 64, - "SPLIT_N": 32, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 95e76f479321..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=16,hidden_size=64256": { - "BLOCK_N": 512, - "SPLIT_N": 16, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=6848 
device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index b967d91645ed..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=16,hidden_size=6848": { - "BLOCK_N": 128, - "SPLIT_N": 32, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 6c2f971176df..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=16,hidden_size=6912": { - "BLOCK_N": 32, - "SPLIT_N": 256, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index e0203c01009e..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=16,hidden_size=7168": { - "BLOCK_N": 64, - "SPLIT_N": 64, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 0db797564e0d..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=16,hidden_size=8192": { - "BLOCK_N": 32, - "SPLIT_N": 128, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index ab2faa8a3e47..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=16,hidden_size=9216": { - "BLOCK_N": 32, - "SPLIT_N": 16, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 99d36f108d24..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=2,hidden_size=1024": { - "BLOCK_N": 32, - "SPLIT_N": 32, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 5d1797c7df6a..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=2,hidden_size=10240": { - "BLOCK_N": 64, - "SPLIT_N": 256, - "num_warps": 8 - } -} diff --git 
a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 3c5a379e0bdc..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=2,hidden_size=102400": { - "BLOCK_N": 32, - "SPLIT_N": 256, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 16f2497bed72..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=2,hidden_size=102656": { - "BLOCK_N": 64, - "SPLIT_N": 128, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index de9477263adf..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=2,hidden_size=11008": { - "BLOCK_N": 64, - "SPLIT_N": 256, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 58b67d1eb450..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=2,hidden_size=1152": { - "BLOCK_N": 64, - "SPLIT_N": 32, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 3b20b74c6b65..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=2,hidden_size=128": { - "BLOCK_N": 32, - "SPLIT_N": 16, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 711133cabf41..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=2,hidden_size=1280": { - "BLOCK_N": 64, - "SPLIT_N": 32, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 8ae36e752fa0..000000000000 --- 
a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=2,hidden_size=128000": { - "BLOCK_N": 1024, - "SPLIT_N": 128, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 9bd2e640b741..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=2,hidden_size=128256": { - "BLOCK_N": 64, - "SPLIT_N": 128, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index e9ef08e28930..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=2,hidden_size=13824": { - "BLOCK_N": 64, - "SPLIT_N": 256, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 02df6114edd3..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=2,hidden_size=14336": { - "BLOCK_N": 64, - "SPLIT_N": 256, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index f174cccf6781..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=2,hidden_size=1536": { - "BLOCK_N": 64, - "SPLIT_N": 32, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 95398b063d5a..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=2,hidden_size=15360": { - "BLOCK_N": 32, - "SPLIT_N": 256, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index a9eb1222067d..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=2,hidden_size=2048": { - "BLOCK_N": 64, - "SPLIT_N": 32, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=22016 
device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index b0376ebedcf0..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=2,hidden_size=22016": { - "BLOCK_N": 32, - "SPLIT_N": 256, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 33a78ee55501..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=2,hidden_size=2304": { - "BLOCK_N": 32, - "SPLIT_N": 128, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 5d62f3bc77ad..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=2,hidden_size=24576": { - "BLOCK_N": 32, - "SPLIT_N": 256, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index bb04a656ec58..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=2,hidden_size=256": { - "BLOCK_N": 32, - "SPLIT_N": 16, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 76589c2e4848..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=2,hidden_size=2560": { - "BLOCK_N": 32, - "SPLIT_N": 128, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index c4f0e1fe7d02..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=2,hidden_size=27392": { - "BLOCK_N": 32, - "SPLIT_N": 128, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 4e3e51669b1a..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - 
"batchs=2,hidden_size=2752": { - "BLOCK_N": 64, - "SPLIT_N": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 76507320d8c9..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=2,hidden_size=27648": { - "BLOCK_N": 64, - "SPLIT_N": 64, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 95aefbac204c..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=2,hidden_size=3072": { - "BLOCK_N": 32, - "SPLIT_N": 128, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 804c9a7df946..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=2,hidden_size=32000": { - "BLOCK_N": 512, - "SPLIT_N": 64, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 8c4069b4c997..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=2,hidden_size=32256": { - "BLOCK_N": 64, - "SPLIT_N": 32, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 0de6a013f40e..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=2,hidden_size=32512": { - "BLOCK_N": 32, - "SPLIT_N": 256, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index fbc4f954e962..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=2,hidden_size=32768": { - "BLOCK_N": 128, - "SPLIT_N": 32, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 
index 89dff508fcff..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=2,hidden_size=33024": { - "BLOCK_N": 32, - "SPLIT_N": 256, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index fb48ebaa8f79..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=2,hidden_size=3328": { - "BLOCK_N": 32, - "SPLIT_N": 128, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 7603143e1ff2..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=2,hidden_size=3456": { - "BLOCK_N": 32, - "SPLIT_N": 128, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index f119d16aa81c..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=2,hidden_size=3584": { - "BLOCK_N": 32, - "SPLIT_N": 128, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index b19e8f096df7..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=2,hidden_size=36864": { - "BLOCK_N": 32, - "SPLIT_N": 128, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index e870cd0967a0..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=2,hidden_size=4096": { - "BLOCK_N": 32, - "SPLIT_N": 128, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index b30bd7d45e40..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=2,hidden_size=43264": { - "BLOCK_N": 64, - "SPLIT_N": 256, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=4608 
device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 29ab8038c085..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=2,hidden_size=4608": { - "BLOCK_N": 32, - "SPLIT_N": 256, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 39faabbdede5..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=2,hidden_size=49152": { - "BLOCK_N": 128, - "SPLIT_N": 256, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 87cad481bbef..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=2,hidden_size=512": { - "BLOCK_N": 32, - "SPLIT_N": 16, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index ec7d7b1e6d69..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=2,hidden_size=5120": { - "BLOCK_N": 32, - "SPLIT_N": 256, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 779014c6a48d..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=2,hidden_size=5504": { - "BLOCK_N": 64, - "SPLIT_N": 128, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index e4e1610914d4..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=2,hidden_size=5632": { - "BLOCK_N": 32, - "SPLIT_N": 256, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 91e760ed29e7..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - 
"batchs=2,hidden_size=6144": { - "BLOCK_N": 32, - "SPLIT_N": 256, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index d1dba65beeb5..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=2,hidden_size=6400": { - "BLOCK_N": 32, - "SPLIT_N": 256, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 63cd02986d0e..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=2,hidden_size=64000": { - "BLOCK_N": 128, - "SPLIT_N": 128, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 53bd1e2a033d..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=2,hidden_size=64256": { - "BLOCK_N": 256, - "SPLIT_N": 256, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index ceaea00d1ad4..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=2,hidden_size=6848": { - "BLOCK_N": 128, - "SPLIT_N": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index f7fe5d732ec5..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=2,hidden_size=6912": { - "BLOCK_N": 32, - "SPLIT_N": 256, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index d026e12311a7..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=2,hidden_size=7168": { - "BLOCK_N": 32, - "SPLIT_N": 256, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 
1ce097bb563d..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=2,hidden_size=8192": { - "BLOCK_N": 32, - "SPLIT_N": 256, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index f22a1513a6a4..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=2,hidden_size=9216": { - "BLOCK_N": 64, - "SPLIT_N": 256, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index dc055ce5a023..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=24,hidden_size=1024": { - "BLOCK_N": 32, - "SPLIT_N": 16, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index fec5aa8a43a5..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=24,hidden_size=10240": { - "BLOCK_N": 512, - "SPLIT_N": 8, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index f76e21dcf101..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=24,hidden_size=102400": { - "BLOCK_N": 256, - "SPLIT_N": 8, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 02e01a88f229..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=24,hidden_size=102656": { - "BLOCK_N": 128, - "SPLIT_N": 16, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index ca13ab17631b..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=24,hidden_size=11008": { - "BLOCK_N": 128, - "SPLIT_N": 8, - "num_warps": 8 - } -} diff --git 
a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 221b7046a42d..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=24,hidden_size=1152": { - "BLOCK_N": 32, - "SPLIT_N": 16, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 04e2fee606e4..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=24,hidden_size=128": { - "BLOCK_N": 64, - "SPLIT_N": 2, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index f3e8e4c95080..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=24,hidden_size=1280": { - "BLOCK_N": 32, - "SPLIT_N": 32, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 92d8dc48ef21..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=24,hidden_size=128000": { - "BLOCK_N": 512, - "SPLIT_N": 16, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index deddd170e828..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=24,hidden_size=128256": { - "BLOCK_N": 256, - "SPLIT_N": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index d8dba1076582..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=24,hidden_size=13824": { - "BLOCK_N": 256, - "SPLIT_N": 16, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 585c1d33ce0d..000000000000 --- 
a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=24,hidden_size=14336": { - "BLOCK_N": 128, - "SPLIT_N": 16, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 3a2668c5d3fd..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=24,hidden_size=1536": { - "BLOCK_N": 32, - "SPLIT_N": 16, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index ac9189fdf242..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=24,hidden_size=15360": { - "BLOCK_N": 64, - "SPLIT_N": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 9cc54aa3ceae..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=24,hidden_size=2048": { - "BLOCK_N": 32, - "SPLIT_N": 32, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index a49ce276482b..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=24,hidden_size=22016": { - "BLOCK_N": 128, - "SPLIT_N": 16, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index b66638610a4a..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=24,hidden_size=2304": { - "BLOCK_N": 32, - "SPLIT_N": 16, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 6a6f0780ee68..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=24,hidden_size=24576": { - "BLOCK_N": 256, - "SPLIT_N": 16, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=256 
device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 64cdc015d4f5..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=24,hidden_size=256": { - "BLOCK_N": 32, - "SPLIT_N": 8, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index d768b5944d32..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=24,hidden_size=2560": { - "BLOCK_N": 32, - "SPLIT_N": 32, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index e1af0bc7d4a2..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=24,hidden_size=27392": { - "BLOCK_N": 256, - "SPLIT_N": 16, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index de80b48b9e46..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=24,hidden_size=2752": { - "BLOCK_N": 32, - "SPLIT_N": 16, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 2f1cc53d9d07..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=24,hidden_size=27648": { - "BLOCK_N": 64, - "SPLIT_N": 32, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 5399d322bca2..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=24,hidden_size=3072": { - "BLOCK_N": 32, - "SPLIT_N": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index ebe63936e73e..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null 
@@ -1,7 +0,0 @@ -{ - "batchs=24,hidden_size=32000": { - "BLOCK_N": 128, - "SPLIT_N": 16, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index d0af75521d17..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=24,hidden_size=32256": { - "BLOCK_N": 256, - "SPLIT_N": 8, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index ecb601665b16..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=24,hidden_size=32512": { - "BLOCK_N": 256, - "SPLIT_N": 8, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 0a95531a6226..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=24,hidden_size=32768": { - "BLOCK_N": 32, - "SPLIT_N": 32, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index e927a860d646..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=24,hidden_size=33024": { - "BLOCK_N": 64, - "SPLIT_N": 16, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 64757e015c63..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=24,hidden_size=3328": { - "BLOCK_N": 64, - "SPLIT_N": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index de0faf408e1f..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=24,hidden_size=3456": { - "BLOCK_N": 32, - "SPLIT_N": 32, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=3584 
device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 5c93deb397ab..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=24,hidden_size=3584": { - "BLOCK_N": 256, - "SPLIT_N": 16, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 3ce9a9150319..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=24,hidden_size=36864": { - "BLOCK_N": 64, - "SPLIT_N": 32, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 82f6e893c6ad..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=24,hidden_size=4096": { - "BLOCK_N": 128, - "SPLIT_N": 64, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index e21f857efe75..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=24,hidden_size=43264": { - "BLOCK_N": 128, - "SPLIT_N": 8, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index b41625baab6a..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=24,hidden_size=4608": { - "BLOCK_N": 32, - "SPLIT_N": 64, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 9ce03febee07..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=24,hidden_size=49152": { - "BLOCK_N": 64, - "SPLIT_N": 16, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index efecf1f371f9..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=24,hidden_size=512": { - "BLOCK_N": 32, - "SPLIT_N": 16, - "num_warps": 8 - } -} 
diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index b1a5db564eba..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=24,hidden_size=5120": { - "BLOCK_N": 128, - "SPLIT_N": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index d105bc53b555..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=24,hidden_size=5504": { - "BLOCK_N": 32, - "SPLIT_N": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index e042d730fde0..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=24,hidden_size=5632": { - "BLOCK_N": 256, - "SPLIT_N": 8, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 005e8480d530..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=24,hidden_size=6144": { - "BLOCK_N": 256, - "SPLIT_N": 16, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index b0ce387d16a7..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=24,hidden_size=6400": { - "BLOCK_N": 256, - "SPLIT_N": 16, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index fe8d2347f8be..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=24,hidden_size=64000": { - "BLOCK_N": 64, - "SPLIT_N": 32, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 19016d92afc8..000000000000 --- 
a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=24,hidden_size=64256": { - "BLOCK_N": 1024, - "SPLIT_N": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index c24e63d4b061..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=24,hidden_size=6848": { - "BLOCK_N": 32, - "SPLIT_N": 16, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 621e45f3d647..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=24,hidden_size=6912": { - "BLOCK_N": 64, - "SPLIT_N": 64, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 1764f42a3690..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=24,hidden_size=7168": { - "BLOCK_N": 256, - "SPLIT_N": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 2c005c34c3b5..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=24,hidden_size=8192": { - "BLOCK_N": 128, - "SPLIT_N": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 7a560bf5b977..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=24,hidden_size=9216": { - "BLOCK_N": 128, - "SPLIT_N": 16, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 0fb0abbfd93b..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=32,hidden_size=1024": { - "BLOCK_N": 32, - "SPLIT_N": 16, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=10240 
device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 5089debbfefd..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=32,hidden_size=10240": { - "BLOCK_N": 256, - "SPLIT_N": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 879a1d36c817..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=32,hidden_size=102400": { - "BLOCK_N": 32, - "SPLIT_N": 16, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index eda15de61763..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=32,hidden_size=102656": { - "BLOCK_N": 64, - "SPLIT_N": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 10b2451c5f88..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=32,hidden_size=11008": { - "BLOCK_N": 64, - "SPLIT_N": 16, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 3ef065e97426..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=32,hidden_size=1152": { - "BLOCK_N": 32, - "SPLIT_N": 16, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 26ecc87645eb..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=32,hidden_size=128": { - "BLOCK_N": 32, - "SPLIT_N": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index cea687cffe03..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json +++ 
/dev/null @@ -1,7 +0,0 @@ -{ - "batchs=32,hidden_size=1280": { - "BLOCK_N": 32, - "SPLIT_N": 16, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 50b07329f967..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=32,hidden_size=128000": { - "BLOCK_N": 128, - "SPLIT_N": 16, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index c9bc76d4e02d..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=32,hidden_size=128256": { - "BLOCK_N": 128, - "SPLIT_N": 16, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 5e2bd08a4728..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=32,hidden_size=13824": { - "BLOCK_N": 64, - "SPLIT_N": 16, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 95c19b2bafe1..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=32,hidden_size=14336": { - "BLOCK_N": 64, - "SPLIT_N": 16, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 2a3648ab54f4..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=32,hidden_size=1536": { - "BLOCK_N": 32, - "SPLIT_N": 16, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index d384ecbf3556..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=32,hidden_size=15360": { - "BLOCK_N": 256, - "SPLIT_N": 16, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=2048 
device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 017eb531ae10..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=32,hidden_size=2048": { - "BLOCK_N": 32, - "SPLIT_N": 16, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 841202546c13..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=32,hidden_size=22016": { - "BLOCK_N": 256, - "SPLIT_N": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 78e4c0323585..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=32,hidden_size=2304": { - "BLOCK_N": 128, - "SPLIT_N": 32, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 1ac24aae29fe..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=32,hidden_size=24576": { - "BLOCK_N": 64, - "SPLIT_N": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 8422904ffb26..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=32,hidden_size=256": { - "BLOCK_N": 32, - "SPLIT_N": 16, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 8a0eb3dd15c3..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=32,hidden_size=2560": { - "BLOCK_N": 32, - "SPLIT_N": 128, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 0e433e6bed3d..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=32,hidden_size=27392": { - "BLOCK_N": 64, - "SPLIT_N": 16, - "num_warps": 4 - } -} 
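Each file removed in this hunk is a per-shape tuning record for the bgmv Triton kernels: the filename encodes the op type, batch count, hidden size and GPU name, and the JSON body maps the same "batchs=<n>,hidden_size=<h>" key to the launch parameters BLOCK_N, SPLIT_N and num_warps selected for that shape. As a rough illustration of how such a record could be resolved at runtime, a minimal sketch follows; the actual lookup code is not part of this hunk, and the helper name, directory constant and device string below are assumptions.

# Illustrative sketch only -- not code from this patch. Helper name and
# CONFIG_DIR are hypothetical; only the file-name/JSON layout is taken from
# the deleted files shown in the diff.
import json
from pathlib import Path
from typing import Optional

CONFIG_DIR = Path("vllm/lora/ops/bgmv_configs")  # assumed location of the tuned configs


def load_bgmv_config(op_type: str, batchs: int, hidden_size: int,
                     device_name: str) -> Optional[dict]:
    # File names follow the pattern used by the deleted files:
    # "op_type=<t>,batchs=<n>,hidden_size=<h> device_name=<gpu>.json"
    fname = (f"op_type={op_type},batchs={batchs},hidden_size={hidden_size} "
             f"device_name={device_name}.json")
    path = CONFIG_DIR / fname
    if not path.is_file():
        return None  # no tuned entry; caller would fall back to defaults
    with path.open() as f:
        data = json.load(f)
    # The JSON body is keyed by the same shape string, e.g.
    # {"batchs=24,hidden_size=4096": {"BLOCK_N": 128, "SPLIT_N": 64, "num_warps": 4}}
    return data.get(f"batchs={batchs},hidden_size={hidden_size}")


# Example, using values from one of the files deleted above:
# load_bgmv_config("expand", 24, 4096, "NVIDIA_GeForce_RTX_3090")
# -> {"BLOCK_N": 128, "SPLIT_N": 64, "num_warps": 4}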
diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index d6adf2ceb6b6..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=32,hidden_size=2752": { - "BLOCK_N": 64, - "SPLIT_N": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index bca14db36270..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=32,hidden_size=27648": { - "BLOCK_N": 64, - "SPLIT_N": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 985d62438445..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=32,hidden_size=3072": { - "BLOCK_N": 64, - "SPLIT_N": 32, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index e8464e7008ce..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=32,hidden_size=32000": { - "BLOCK_N": 128, - "SPLIT_N": 8, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index a47ee1a7459e..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=32,hidden_size=32256": { - "BLOCK_N": 1024, - "SPLIT_N": 8, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 889bbdc9e7c8..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=32,hidden_size=32512": { - "BLOCK_N": 256, - "SPLIT_N": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 7f722c7cc4f2..000000000000 --- 
a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=32,hidden_size=32768": { - "BLOCK_N": 512, - "SPLIT_N": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index ee0493b17a74..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=32,hidden_size=33024": { - "BLOCK_N": 32, - "SPLIT_N": 8, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 552f181d0e3a..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=32,hidden_size=3328": { - "BLOCK_N": 32, - "SPLIT_N": 32, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index d2fb833b52b7..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=32,hidden_size=3456": { - "BLOCK_N": 256, - "SPLIT_N": 8, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 6fd72f2cb7df..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=32,hidden_size=3584": { - "BLOCK_N": 32, - "SPLIT_N": 32, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index b566409ba7bc..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=32,hidden_size=36864": { - "BLOCK_N": 512, - "SPLIT_N": 8, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 02781cc3c3a3..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=32,hidden_size=4096": { - "BLOCK_N": 128, - "SPLIT_N": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=43264 
device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 5eb93d119fd2..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=32,hidden_size=43264": { - "BLOCK_N": 64, - "SPLIT_N": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 691bf7cf400d..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=32,hidden_size=4608": { - "BLOCK_N": 32, - "SPLIT_N": 16, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index e6f417f1ce35..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=32,hidden_size=49152": { - "BLOCK_N": 64, - "SPLIT_N": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index ecc4311c65c6..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=32,hidden_size=512": { - "BLOCK_N": 32, - "SPLIT_N": 8, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 2f8e3bad9359..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=32,hidden_size=5120": { - "BLOCK_N": 256, - "SPLIT_N": 16, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 28bb43c5cea0..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=32,hidden_size=5504": { - "BLOCK_N": 128, - "SPLIT_N": 16, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index ffd213467c8b..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 
+0,0 @@ -{ - "batchs=32,hidden_size=5632": { - "BLOCK_N": 256, - "SPLIT_N": 16, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 43dd6d5c4f34..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=32,hidden_size=6144": { - "BLOCK_N": 128, - "SPLIT_N": 16, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index a73329612467..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=32,hidden_size=6400": { - "BLOCK_N": 128, - "SPLIT_N": 16, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 1537688252ba..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=32,hidden_size=64000": { - "BLOCK_N": 512, - "SPLIT_N": 8, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index bdd7de7b0544..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=32,hidden_size=64256": { - "BLOCK_N": 512, - "SPLIT_N": 8, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 537e8a289957..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=32,hidden_size=6848": { - "BLOCK_N": 128, - "SPLIT_N": 8, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 5822a67015d7..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=32,hidden_size=6912": { - "BLOCK_N": 256, - "SPLIT_N": 8, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json 
deleted file mode 100644 index 091e7c378078..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=32,hidden_size=7168": { - "BLOCK_N": 64, - "SPLIT_N": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index d0c8d9d533a7..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=32,hidden_size=8192": { - "BLOCK_N": 128, - "SPLIT_N": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 9b047851381d..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=32,hidden_size=9216": { - "BLOCK_N": 64, - "SPLIT_N": 16, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index ba9a5daa8327..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=4,hidden_size=1024": { - "BLOCK_N": 32, - "SPLIT_N": 32, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 49bcf2f569f7..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=4,hidden_size=10240": { - "BLOCK_N": 32, - "SPLIT_N": 128, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 6285daf17f71..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=4,hidden_size=102400": { - "BLOCK_N": 64, - "SPLIT_N": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index bd9ce93f682f..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=4,hidden_size=102656": { - "BLOCK_N": 64, - "SPLIT_N": 32, - "num_warps": 8 - } -} diff --git 
a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 939967371660..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=4,hidden_size=11008": { - "BLOCK_N": 32, - "SPLIT_N": 128, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 798594bfd3a5..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=4,hidden_size=1152": { - "BLOCK_N": 64, - "SPLIT_N": 32, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 3fa0c0edbdfe..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=4,hidden_size=128": { - "BLOCK_N": 32, - "SPLIT_N": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 531e3e4accaa..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=4,hidden_size=1280": { - "BLOCK_N": 32, - "SPLIT_N": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 9d597ca3ab45..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=4,hidden_size=128000": { - "BLOCK_N": 128, - "SPLIT_N": 64, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 7826d6de2043..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=4,hidden_size=128256": { - "BLOCK_N": 128, - "SPLIT_N": 128, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 4382dfac1232..000000000000 --- 
a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=4,hidden_size=13824": { - "BLOCK_N": 32, - "SPLIT_N": 128, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 0dc25b7a9c47..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=4,hidden_size=14336": { - "BLOCK_N": 32, - "SPLIT_N": 256, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 2cb628ac30d9..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=4,hidden_size=1536": { - "BLOCK_N": 32, - "SPLIT_N": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index d3ade6322fcf..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=4,hidden_size=15360": { - "BLOCK_N": 256, - "SPLIT_N": 32, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index f743a190ff6c..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=4,hidden_size=2048": { - "BLOCK_N": 32, - "SPLIT_N": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 13dc549b58a4..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=4,hidden_size=22016": { - "BLOCK_N": 128, - "SPLIT_N": 64, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index dd31e03333ef..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=4,hidden_size=2304": { - "BLOCK_N": 64, - "SPLIT_N": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=24576 
device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index f873a2168d70..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=4,hidden_size=24576": { - "BLOCK_N": 32, - "SPLIT_N": 256, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index f654b1763c6b..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=4,hidden_size=256": { - "BLOCK_N": 32, - "SPLIT_N": 8, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index c3d6c38da9ce..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=4,hidden_size=2560": { - "BLOCK_N": 64, - "SPLIT_N": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 6ebeee44d74f..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=4,hidden_size=27392": { - "BLOCK_N": 64, - "SPLIT_N": 256, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 4583c1bd2a74..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=4,hidden_size=2752": { - "BLOCK_N": 64, - "SPLIT_N": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 457cba2bb27a..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=4,hidden_size=27648": { - "BLOCK_N": 128, - "SPLIT_N": 128, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 6c60fd9cf325..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - 
"batchs=4,hidden_size=3072": { - "BLOCK_N": 64, - "SPLIT_N": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 12dff65ef5e3..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=4,hidden_size=32000": { - "BLOCK_N": 32, - "SPLIT_N": 256, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 7e6bcdc82b12..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=4,hidden_size=32256": { - "BLOCK_N": 64, - "SPLIT_N": 256, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 5cef4c0639e2..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=4,hidden_size=32512": { - "BLOCK_N": 32, - "SPLIT_N": 128, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index d475b36f7b10..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=4,hidden_size=32768": { - "BLOCK_N": 64, - "SPLIT_N": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 62bd24b55325..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=4,hidden_size=33024": { - "BLOCK_N": 128, - "SPLIT_N": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index b9d49f65f25c..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=4,hidden_size=3328": { - "BLOCK_N": 64, - "SPLIT_N": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 
index f13c7ea9fcdb..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=4,hidden_size=3456": { - "BLOCK_N": 64, - "SPLIT_N": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 3c8bb10faf54..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=4,hidden_size=3584": { - "BLOCK_N": 32, - "SPLIT_N": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index eeeed1d55f4c..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=4,hidden_size=36864": { - "BLOCK_N": 128, - "SPLIT_N": 256, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index c1780da9065d..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=4,hidden_size=4096": { - "BLOCK_N": 32, - "SPLIT_N": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 73265ea43e99..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=4,hidden_size=43264": { - "BLOCK_N": 128, - "SPLIT_N": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 95ef324ce999..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=4,hidden_size=4608": { - "BLOCK_N": 64, - "SPLIT_N": 128, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index a2861173e71e..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=4,hidden_size=49152": { - "BLOCK_N": 256, - "SPLIT_N": 256, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=512 
device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 03789328aa67..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=4,hidden_size=512": { - "BLOCK_N": 32, - "SPLIT_N": 16, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 2ecae2ab22ab..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=4,hidden_size=5120": { - "BLOCK_N": 64, - "SPLIT_N": 128, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 7979c4049101..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=4,hidden_size=5504": { - "BLOCK_N": 64, - "SPLIT_N": 128, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index e11fdff5cf8b..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=4,hidden_size=5632": { - "BLOCK_N": 32, - "SPLIT_N": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index b5d4291484ac..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=4,hidden_size=6144": { - "BLOCK_N": 32, - "SPLIT_N": 256, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 510e5bcdd8f9..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=4,hidden_size=6400": { - "BLOCK_N": 64, - "SPLIT_N": 128, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index a7d9938f211d..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - 
"batchs=4,hidden_size=64000": { - "BLOCK_N": 256, - "SPLIT_N": 32, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 4b4d5715c4a6..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=4,hidden_size=64256": { - "BLOCK_N": 128, - "SPLIT_N": 128, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index bdc940e4306d..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=4,hidden_size=6848": { - "BLOCK_N": 64, - "SPLIT_N": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 6ac8e567768d..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=4,hidden_size=6912": { - "BLOCK_N": 32, - "SPLIT_N": 128, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index a5a8025c74e2..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=4,hidden_size=7168": { - "BLOCK_N": 64, - "SPLIT_N": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 4937a00c96b9..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=4,hidden_size=8192": { - "BLOCK_N": 128, - "SPLIT_N": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 14d37e0e84da..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=4,hidden_size=9216": { - "BLOCK_N": 32, - "SPLIT_N": 128, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 
0655aeaf04d4..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=8,hidden_size=1024": { - "BLOCK_N": 32, - "SPLIT_N": 32, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 4da43af96a88..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=8,hidden_size=10240": { - "BLOCK_N": 64, - "SPLIT_N": 128, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 02944f10112d..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=8,hidden_size=102400": { - "BLOCK_N": 64, - "SPLIT_N": 128, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 888cbee83cd0..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=8,hidden_size=102656": { - "BLOCK_N": 32, - "SPLIT_N": 64, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 9b48040ae35d..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=8,hidden_size=11008": { - "BLOCK_N": 128, - "SPLIT_N": 128, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 17628098a876..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=8,hidden_size=1152": { - "BLOCK_N": 64, - "SPLIT_N": 32, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index ba77dfd4e745..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=8,hidden_size=128": { - "BLOCK_N": 32, - "SPLIT_N": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=1280 
device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index afc038f82824..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=8,hidden_size=1280": { - "BLOCK_N": 64, - "SPLIT_N": 32, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index cfca3795cf0c..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=8,hidden_size=128000": { - "BLOCK_N": 512, - "SPLIT_N": 64, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 40205831e8c5..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=8,hidden_size=128256": { - "BLOCK_N": 128, - "SPLIT_N": 32, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 73480c2a2fb0..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=8,hidden_size=13824": { - "BLOCK_N": 64, - "SPLIT_N": 32, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 0ba68ff88dd7..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=8,hidden_size=14336": { - "BLOCK_N": 32, - "SPLIT_N": 64, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 1411944ed903..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=8,hidden_size=1536": { - "BLOCK_N": 64, - "SPLIT_N": 32, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 52d4d6d866da..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ 
-1,7 +0,0 @@ -{ - "batchs=8,hidden_size=15360": { - "BLOCK_N": 1024, - "SPLIT_N": 16, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 5e8c8b03d807..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=8,hidden_size=2048": { - "BLOCK_N": 32, - "SPLIT_N": 32, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index ea2e2b703621..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=8,hidden_size=22016": { - "BLOCK_N": 128, - "SPLIT_N": 16, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index e07a33a9f890..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=8,hidden_size=2304": { - "BLOCK_N": 64, - "SPLIT_N": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 1cd26dfc178c..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=8,hidden_size=24576": { - "BLOCK_N": 128, - "SPLIT_N": 128, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 2101c81521e4..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=8,hidden_size=256": { - "BLOCK_N": 32, - "SPLIT_N": 8, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index c6545c96c672..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=8,hidden_size=2560": { - "BLOCK_N": 32, - "SPLIT_N": 32, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 
100644 index dee776ee0b6b..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=8,hidden_size=27392": { - "BLOCK_N": 256, - "SPLIT_N": 32, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index b34f648169cf..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=8,hidden_size=2752": { - "BLOCK_N": 32, - "SPLIT_N": 32, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index b90111549674..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=8,hidden_size=27648": { - "BLOCK_N": 128, - "SPLIT_N": 16, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index a07aa0c23f3e..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=8,hidden_size=3072": { - "BLOCK_N": 32, - "SPLIT_N": 32, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 70c8889ecf4f..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=8,hidden_size=32000": { - "BLOCK_N": 64, - "SPLIT_N": 32, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 9050b5588db6..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=8,hidden_size=32256": { - "BLOCK_N": 128, - "SPLIT_N": 128, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 64a0a899f92c..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=8,hidden_size=32512": { - "BLOCK_N": 256, - "SPLIT_N": 16, - "num_warps": 8 - } -} diff --git 
a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 3fc62e336640..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=8,hidden_size=32768": { - "BLOCK_N": 128, - "SPLIT_N": 32, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 8bf15baf2d0e..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=8,hidden_size=33024": { - "BLOCK_N": 128, - "SPLIT_N": 16, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 3b72a219f413..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=8,hidden_size=3328": { - "BLOCK_N": 32, - "SPLIT_N": 64, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 555f04b52080..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=8,hidden_size=3456": { - "BLOCK_N": 128, - "SPLIT_N": 32, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index f5e814a48de6..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=8,hidden_size=3584": { - "BLOCK_N": 128, - "SPLIT_N": 32, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index aed09238da26..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=8,hidden_size=36864": { - "BLOCK_N": 32, - "SPLIT_N": 64, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index e32fcb1ffb63..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=4096 
device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=8,hidden_size=4096": { - "BLOCK_N": 128, - "SPLIT_N": 32, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 120a7ac01e61..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=8,hidden_size=43264": { - "BLOCK_N": 256, - "SPLIT_N": 128, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 3580fa47a62f..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=8,hidden_size=4608": { - "BLOCK_N": 32, - "SPLIT_N": 64, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 5b7de710ac80..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=8,hidden_size=49152": { - "BLOCK_N": 256, - "SPLIT_N": 256, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index c25b01a3a2ee..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=8,hidden_size=512": { - "BLOCK_N": 32, - "SPLIT_N": 16, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 39e20cdd3dc4..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=8,hidden_size=5120": { - "BLOCK_N": 32, - "SPLIT_N": 32, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 6103c0f82883..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=8,hidden_size=5504": { - "BLOCK_N": 128, - "SPLIT_N": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=5632 
device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 534348364229..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=8,hidden_size=5632": { - "BLOCK_N": 64, - "SPLIT_N": 32, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 29cdb7e2b43b..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=8,hidden_size=6144": { - "BLOCK_N": 128, - "SPLIT_N": 32, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index e5362a24e683..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=8,hidden_size=6400": { - "BLOCK_N": 32, - "SPLIT_N": 256, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 6f5a25fef3b2..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=8,hidden_size=64000": { - "BLOCK_N": 256, - "SPLIT_N": 32, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 5cd46fba6793..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=8,hidden_size=64256": { - "BLOCK_N": 64, - "SPLIT_N": 256, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 7afa26abf9ed..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=8,hidden_size=6848": { - "BLOCK_N": 128, - "SPLIT_N": 64, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index c2fbb625f0cb..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=8,hidden_size=6912": { - "BLOCK_N": 64, - "SPLIT_N": 128, - "num_warps": 8 - } -} diff --git 
a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 182c287a971c..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=8,hidden_size=7168": { - "BLOCK_N": 64, - "SPLIT_N": 128, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 543cfc5cf252..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=8,hidden_size=8192": { - "BLOCK_N": 128, - "SPLIT_N": 64, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 209fd07020ce..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=8,hidden_size=9216": { - "BLOCK_N": 32, - "SPLIT_N": 128, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index e7807642e242..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=1,hidden_size=1024": { - "BLOCK_K": 32, - "SPLIT_K": 32, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index d990a464aead..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=1,hidden_size=10240": { - "BLOCK_K": 128, - "SPLIT_K": 128, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index f8ba6f98e11d..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=1,hidden_size=102400": { - "BLOCK_K": 64, - "SPLIT_K": 128, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 2820c3819fb1..000000000000 --- 
a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=1,hidden_size=102656": { - "BLOCK_K": 64, - "SPLIT_K": 128, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 8ff38bafd95b..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=1,hidden_size=11008": { - "BLOCK_K": 128, - "SPLIT_K": 128, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 2239222c6721..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=1,hidden_size=1152": { - "BLOCK_K": 32, - "SPLIT_K": 64, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 76ebd9a6187b..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=1,hidden_size=128": { - "BLOCK_K": 32, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 604d0f4a24f6..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=1,hidden_size=1280": { - "BLOCK_K": 32, - "SPLIT_K": 64, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 7b9522e223c9..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=1,hidden_size=128000": { - "BLOCK_K": 128, - "SPLIT_K": 128, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 16ca324abb1d..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=1,hidden_size=128256": { - "BLOCK_K": 128, - "SPLIT_K": 128, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=13824 
device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 51fc783ba5be..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=1,hidden_size=13824": { - "BLOCK_K": 128, - "SPLIT_K": 128, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 2b5c9fb008a5..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=1,hidden_size=14336": { - "BLOCK_K": 128, - "SPLIT_K": 128, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 1d6fdcc9e4cf..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=1,hidden_size=1536": { - "BLOCK_K": 32, - "SPLIT_K": 64, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 0188952eabb5..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=1,hidden_size=15360": { - "BLOCK_K": 128, - "SPLIT_K": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 748317facd1a..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=1,hidden_size=2048": { - "BLOCK_K": 64, - "SPLIT_K": 32, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 762c6902553f..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=1,hidden_size=22016": { - "BLOCK_K": 64, - "SPLIT_K": 128, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 50693dd027e9..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 
@@ -{ - "batchs=1,hidden_size=2304": { - "BLOCK_K": 64, - "SPLIT_K": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index ce7f797b6501..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=1,hidden_size=24576": { - "BLOCK_K": 64, - "SPLIT_K": 128, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 1fb7817d045a..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=1,hidden_size=256": { - "BLOCK_K": 32, - "SPLIT_K": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 93cb002287b9..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=1,hidden_size=2560": { - "BLOCK_K": 64, - "SPLIT_K": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index df245f7e4e3a..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=1,hidden_size=27392": { - "BLOCK_K": 64, - "SPLIT_K": 128, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index c420bc28686a..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=1,hidden_size=2752": { - "BLOCK_K": 64, - "SPLIT_K": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index ec8253156d69..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=1,hidden_size=27648": { - "BLOCK_K": 128, - "SPLIT_K": 128, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 
8eb952a2f70b..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=1,hidden_size=3072": { - "BLOCK_K": 64, - "SPLIT_K": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 5f4e26b927c4..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=1,hidden_size=32000": { - "BLOCK_K": 128, - "SPLIT_K": 128, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 6567393424e9..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=1,hidden_size=32256": { - "BLOCK_K": 128, - "SPLIT_K": 128, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 0de485aa91ed..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=1,hidden_size=32512": { - "BLOCK_K": 64, - "SPLIT_K": 128, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 382626229f94..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=1,hidden_size=32768": { - "BLOCK_K": 128, - "SPLIT_K": 128, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index f339e51540b0..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=1,hidden_size=33024": { - "BLOCK_K": 64, - "SPLIT_K": 128, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 1c29335eaf49..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=1,hidden_size=3328": { - "BLOCK_K": 64, - "SPLIT_K": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=3456 
device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 164746c525a9..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=1,hidden_size=3456": { - "BLOCK_K": 64, - "SPLIT_K": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 32f4a2527a5f..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=1,hidden_size=3584": { - "BLOCK_K": 64, - "SPLIT_K": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index bf9fccfd9628..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=1,hidden_size=36864": { - "BLOCK_K": 64, - "SPLIT_K": 128, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 5afaf9370a75..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=1,hidden_size=4096": { - "BLOCK_K": 64, - "SPLIT_K": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 434e65d6010f..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=1,hidden_size=43264": { - "BLOCK_K": 64, - "SPLIT_K": 128, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 5b05b7d563cb..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=1,hidden_size=4608": { - "BLOCK_K": 128, - "SPLIT_K": 64, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 6ed84d41e515..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - 
"batchs=1,hidden_size=49152": { - "BLOCK_K": 128, - "SPLIT_K": 128, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 57f7208d5b31..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=1,hidden_size=512": { - "BLOCK_K": 32, - "SPLIT_K": 32, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index a39eee775813..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=1,hidden_size=5120": { - "BLOCK_K": 128, - "SPLIT_K": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 386dcefdd9ce..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=1,hidden_size=5504": { - "BLOCK_K": 128, - "SPLIT_K": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 404b2e3e7143..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=1,hidden_size=5632": { - "BLOCK_K": 128, - "SPLIT_K": 64, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index f0f167c7f637..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=1,hidden_size=6144": { - "BLOCK_K": 128, - "SPLIT_K": 64, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index d5caa9380998..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=1,hidden_size=6400": { - "BLOCK_K": 128, - "SPLIT_K": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 
03c28033d268..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=1,hidden_size=64000": { - "BLOCK_K": 128, - "SPLIT_K": 128, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index e5055c7016d7..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=1,hidden_size=64256": { - "BLOCK_K": 64, - "SPLIT_K": 128, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 64efb8751002..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=1,hidden_size=6848": { - "BLOCK_K": 128, - "SPLIT_K": 64, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 090a44e33153..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=1,hidden_size=6912": { - "BLOCK_K": 128, - "SPLIT_K": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 47231de840f8..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=1,hidden_size=7168": { - "BLOCK_K": 128, - "SPLIT_K": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index d0b6943c5df2..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=1,hidden_size=8192": { - "BLOCK_K": 128, - "SPLIT_K": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index bcfe34389c8e..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=1,hidden_size=9216": { - "BLOCK_K": 128, - "SPLIT_K": 128, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=1024 
device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 9f2c62aa7bf3..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=104,hidden_size=1024": { - "BLOCK_K": 32, - "SPLIT_K": 4, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index f205785ac354..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=104,hidden_size=10240": { - "BLOCK_K": 64, - "SPLIT_K": 4, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 3b1ae9af607c..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=104,hidden_size=102400": { - "BLOCK_K": 128, - "SPLIT_K": 1, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 21f6c7c4d2c1..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=104,hidden_size=102656": { - "BLOCK_K": 128, - "SPLIT_K": 1, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 80432fa22304..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=104,hidden_size=11008": { - "BLOCK_K": 32, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 431749d79e07..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=104,hidden_size=1152": { - "BLOCK_K": 64, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index e4cf65f2b466..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=128 
device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=104,hidden_size=128": { - "BLOCK_K": 64, - "SPLIT_K": 1, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 456cdd0cbfa7..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=104,hidden_size=1280": { - "BLOCK_K": 32, - "SPLIT_K": 16, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 657d4601e38d..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=104,hidden_size=128000": { - "BLOCK_K": 128, - "SPLIT_K": 1, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 3fdc882f8d4d..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=104,hidden_size=128256": { - "BLOCK_K": 128, - "SPLIT_K": 2, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index f16d8e73b04f..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=104,hidden_size=13824": { - "BLOCK_K": 64, - "SPLIT_K": 2, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index cdb9f921d9e2..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=104,hidden_size=14336": { - "BLOCK_K": 64, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index a4a2728ea3f7..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=104,hidden_size=1536": { - "BLOCK_K": 64, - "SPLIT_K": 4, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json 
b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index e250ad59c685..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=104,hidden_size=15360": { - "BLOCK_K": 64, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 1fe78e780ab8..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=104,hidden_size=2048": { - "BLOCK_K": 32, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index ea5d35272955..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=104,hidden_size=22016": { - "BLOCK_K": 64, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index c24d3ebad8f6..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=104,hidden_size=2304": { - "BLOCK_K": 64, - "SPLIT_K": 4, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 60d16b61c097..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=104,hidden_size=24576": { - "BLOCK_K": 64, - "SPLIT_K": 4, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 1ac220f8cdb4..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=104,hidden_size=256": { - "BLOCK_K": 32, - "SPLIT_K": 2, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 593b6236bedc..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - 
"batchs=104,hidden_size=2560": { - "BLOCK_K": 64, - "SPLIT_K": 4, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index e02f41487062..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=104,hidden_size=27392": { - "BLOCK_K": 64, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 0e21dad2a1bb..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=104,hidden_size=2752": { - "BLOCK_K": 64, - "SPLIT_K": 2, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index e7de2732db5d..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=104,hidden_size=27648": { - "BLOCK_K": 64, - "SPLIT_K": 2, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 4594326f1214..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=104,hidden_size=3072": { - "BLOCK_K": 32, - "SPLIT_K": 4, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 476b94614e61..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=104,hidden_size=32000": { - "BLOCK_K": 64, - "SPLIT_K": 2, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 3394e0b2b2e2..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=104,hidden_size=32256": { - "BLOCK_K": 64, - "SPLIT_K": 4, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=32512 
device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 47b57eaba2d9..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=104,hidden_size=32512": { - "BLOCK_K": 128, - "SPLIT_K": 2, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 2095dc93481f..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=104,hidden_size=32768": { - "BLOCK_K": 256, - "SPLIT_K": 2, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index d7ae5539e292..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=104,hidden_size=33024": { - "BLOCK_K": 128, - "SPLIT_K": 2, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 392502ed9bef..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=104,hidden_size=3328": { - "BLOCK_K": 128, - "SPLIT_K": 2, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index caaf6dd953b3..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=104,hidden_size=3456": { - "BLOCK_K": 128, - "SPLIT_K": 2, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 34fa2b5fc43e..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=104,hidden_size=3584": { - "BLOCK_K": 64, - "SPLIT_K": 4, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 71e95b2a3456..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=104,hidden_size=36864": { - "BLOCK_K": 128, - 
"SPLIT_K": 2, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 1506da8d5b06..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=104,hidden_size=4096": { - "BLOCK_K": 32, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 82dd0f4c7d44..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=104,hidden_size=43264": { - "BLOCK_K": 64, - "SPLIT_K": 2, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index f97d81fa1054..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=104,hidden_size=4608": { - "BLOCK_K": 256, - "SPLIT_K": 2, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 985c8781d3a0..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=104,hidden_size=49152": { - "BLOCK_K": 128, - "SPLIT_K": 1, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 7ca73c314f25..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=104,hidden_size=512": { - "BLOCK_K": 32, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 1a63f1a720a0..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=104,hidden_size=5120": { - "BLOCK_K": 64, - "SPLIT_K": 2, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 
022bfd82a54b..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=104,hidden_size=5504": { - "BLOCK_K": 32, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 9b22d0042659..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=104,hidden_size=5632": { - "BLOCK_K": 64, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 4075a65a0a7c..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=104,hidden_size=6144": { - "BLOCK_K": 32, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index eabfa5752d0a..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=104,hidden_size=6400": { - "BLOCK_K": 32, - "SPLIT_K": 4, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 4c12481b8079..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=104,hidden_size=64000": { - "BLOCK_K": 256, - "SPLIT_K": 2, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index e30688c4abd2..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=104,hidden_size=64256": { - "BLOCK_K": 128, - "SPLIT_K": 1, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 2e29f14a92cd..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=104,hidden_size=6848": { - "BLOCK_K": 64, - "SPLIT_K": 4, - "num_warps": 4 - } -} diff --git 
a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 13ff36677840..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=104,hidden_size=6912": { - "BLOCK_K": 64, - "SPLIT_K": 4, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 8539aa72ab90..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=104,hidden_size=7168": { - "BLOCK_K": 64, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 53b66c493ea1..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=104,hidden_size=8192": { - "BLOCK_K": 128, - "SPLIT_K": 2, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index a6dfe596884f..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=104,hidden_size=9216": { - "BLOCK_K": 32, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 602ffc66510d..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=16,hidden_size=1024": { - "BLOCK_K": 32, - "SPLIT_K": 16, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index fafea8e3786d..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=16,hidden_size=10240": { - "BLOCK_K": 128, - "SPLIT_K": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 1eec6dc77df8..000000000000 --- 
a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=16,hidden_size=102400": { - "BLOCK_K": 128, - "SPLIT_K": 16, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index dedb0618406f..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=16,hidden_size=102656": { - "BLOCK_K": 256, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index de1dcf78c395..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=16,hidden_size=11008": { - "BLOCK_K": 32, - "SPLIT_K": 16, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 73bc95410c42..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=16,hidden_size=1152": { - "BLOCK_K": 32, - "SPLIT_K": 16, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index c7834800f885..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=16,hidden_size=128": { - "BLOCK_K": 32, - "SPLIT_K": 4, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 65868fe3b2f4..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=16,hidden_size=1280": { - "BLOCK_K": 32, - "SPLIT_K": 16, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 6c18618b318f..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=16,hidden_size=128000": { - "BLOCK_K": 256, - "SPLIT_K": 8, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=128256 
device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index a036606de7cf..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=16,hidden_size=128256": { - "BLOCK_K": 128, - "SPLIT_K": 16, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 89df4c522fae..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=16,hidden_size=13824": { - "BLOCK_K": 32, - "SPLIT_K": 16, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index a142ada1ad07..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=16,hidden_size=14336": { - "BLOCK_K": 128, - "SPLIT_K": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 9bb33d368fef..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=16,hidden_size=1536": { - "BLOCK_K": 32, - "SPLIT_K": 16, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 3a988a776e6f..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=16,hidden_size=15360": { - "BLOCK_K": 128, - "SPLIT_K": 32, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index a57d196ba0ae..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=16,hidden_size=2048": { - "BLOCK_K": 32, - "SPLIT_K": 16, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index c6aa7bf7b99b..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json 
+++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=16,hidden_size=22016": { - "BLOCK_K": 128, - "SPLIT_K": 16, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 83c7c4e4c9a2..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=16,hidden_size=2304": { - "BLOCK_K": 32, - "SPLIT_K": 16, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 427c0106c6ab..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=16,hidden_size=24576": { - "BLOCK_K": 128, - "SPLIT_K": 16, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index d47ff5bbb529..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=16,hidden_size=256": { - "BLOCK_K": 32, - "SPLIT_K": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 6f1afccc1299..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=16,hidden_size=2560": { - "BLOCK_K": 32, - "SPLIT_K": 16, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 61a1b944f16c..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=16,hidden_size=27392": { - "BLOCK_K": 64, - "SPLIT_K": 16, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 6423b56c688d..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=16,hidden_size=2752": { - "BLOCK_K": 32, - "SPLIT_K": 16, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=27648 
device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index aa5170de0248..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=16,hidden_size=27648": { - "BLOCK_K": 64, - "SPLIT_K": 32, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index ae0f48e917f2..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=16,hidden_size=3072": { - "BLOCK_K": 64, - "SPLIT_K": 16, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 6897b5908a5b..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=16,hidden_size=32000": { - "BLOCK_K": 64, - "SPLIT_K": 32, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 4c40c7bfc193..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=16,hidden_size=32256": { - "BLOCK_K": 64, - "SPLIT_K": 16, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 8b38bafedc79..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=16,hidden_size=32512": { - "BLOCK_K": 64, - "SPLIT_K": 16, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 81d3642fe4ec..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=16,hidden_size=32768": { - "BLOCK_K": 64, - "SPLIT_K": 32, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index d5263b43cc15..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=16,hidden_size=33024": { - "BLOCK_K": 256, - "SPLIT_K": 8, - 
"num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 35d23d34e4a7..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=16,hidden_size=3328": { - "BLOCK_K": 64, - "SPLIT_K": 16, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index d72e173b284f..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=16,hidden_size=3456": { - "BLOCK_K": 128, - "SPLIT_K": 16, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index a6387b5fc49e..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=16,hidden_size=3584": { - "BLOCK_K": 32, - "SPLIT_K": 32, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index d80742511305..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=16,hidden_size=36864": { - "BLOCK_K": 128, - "SPLIT_K": 16, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index b27c32ff573f..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=16,hidden_size=4096": { - "BLOCK_K": 64, - "SPLIT_K": 16, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index af6c84968dea..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=16,hidden_size=43264": { - "BLOCK_K": 128, - "SPLIT_K": 8, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 6da56b5f85f0..000000000000 --- 
a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=16,hidden_size=4608": { - "BLOCK_K": 64, - "SPLIT_K": 32, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index aa2f1263faae..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=16,hidden_size=49152": { - "BLOCK_K": 128, - "SPLIT_K": 256, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index f3b5f24c791b..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=16,hidden_size=512": { - "BLOCK_K": 32, - "SPLIT_K": 16, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index dde06af0de34..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=16,hidden_size=5120": { - "BLOCK_K": 128, - "SPLIT_K": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 2dc362e1e1d4..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=16,hidden_size=5504": { - "BLOCK_K": 64, - "SPLIT_K": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 76853f696c19..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=16,hidden_size=5632": { - "BLOCK_K": 32, - "SPLIT_K": 16, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index d498238ea795..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=16,hidden_size=6144": { - "BLOCK_K": 256, - "SPLIT_K": 32, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=6400 
device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index fc10d7616de9..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=16,hidden_size=6400": { - "BLOCK_K": 64, - "SPLIT_K": 128, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 60aed7116c3d..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=16,hidden_size=64000": { - "BLOCK_K": 128, - "SPLIT_K": 8, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 98bccfc678a1..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=16,hidden_size=64256": { - "BLOCK_K": 128, - "SPLIT_K": 32, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 71bf22fc018b..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=16,hidden_size=6848": { - "BLOCK_K": 64, - "SPLIT_K": 64, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 0cbc42cfc86f..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=16,hidden_size=6912": { - "BLOCK_K": 32, - "SPLIT_K": 64, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index ebab6f621840..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=16,hidden_size=7168": { - "BLOCK_K": 64, - "SPLIT_K": 16, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 7f2582d5fabb..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null 
@@ -1,7 +0,0 @@ -{ - "batchs=16,hidden_size=8192": { - "BLOCK_K": 32, - "SPLIT_K": 32, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 37a66a2651be..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=16,hidden_size=9216": { - "BLOCK_K": 64, - "SPLIT_K": 8, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index cb914f7d052d..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=2,hidden_size=1024": { - "BLOCK_K": 32, - "SPLIT_K": 32, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 3cbfe81a8241..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=2,hidden_size=10240": { - "BLOCK_K": 64, - "SPLIT_K": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 01318faf6258..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=2,hidden_size=102400": { - "BLOCK_K": 64, - "SPLIT_K": 128, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 7201e38109c1..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=2,hidden_size=102656": { - "BLOCK_K": 128, - "SPLIT_K": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index d36bf9a65f9e..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=2,hidden_size=11008": { - "BLOCK_K": 64, - "SPLIT_K": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json 
deleted file mode 100644 index 01a390c790a7..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=2,hidden_size=1152": { - "BLOCK_K": 32, - "SPLIT_K": 64, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 068bf8ed0f4e..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=2,hidden_size=128": { - "BLOCK_K": 32, - "SPLIT_K": 16, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 7172561ad91a..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=2,hidden_size=1280": { - "BLOCK_K": 32, - "SPLIT_K": 32, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index c39444441044..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=2,hidden_size=128000": { - "BLOCK_K": 128, - "SPLIT_K": 128, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 0fe8be7dd9e4..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=2,hidden_size=128256": { - "BLOCK_K": 64, - "SPLIT_K": 128, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 463a4228b8b6..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=2,hidden_size=13824": { - "BLOCK_K": 64, - "SPLIT_K": 128, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 1c566ee67de8..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=2,hidden_size=14336": { - "BLOCK_K": 32, - "SPLIT_K": 128, - "num_warps": 8 - } -} diff --git 
a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index bd612e59861b..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=2,hidden_size=1536": { - "BLOCK_K": 32, - "SPLIT_K": 64, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index c6a18389e22b..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=2,hidden_size=15360": { - "BLOCK_K": 64, - "SPLIT_K": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index de611d52747f..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=2,hidden_size=2048": { - "BLOCK_K": 64, - "SPLIT_K": 32, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 7692a7d496a3..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=2,hidden_size=22016": { - "BLOCK_K": 64, - "SPLIT_K": 128, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 67533819f2b7..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=2,hidden_size=2304": { - "BLOCK_K": 64, - "SPLIT_K": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index e7df88ce3d48..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=2,hidden_size=24576": { - "BLOCK_K": 128, - "SPLIT_K": 128, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index d60a540f6c9f..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=256 
device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=2,hidden_size=256": { - "BLOCK_K": 32, - "SPLIT_K": 32, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index d4f9c110fc67..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=2,hidden_size=2560": { - "BLOCK_K": 64, - "SPLIT_K": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index db4554d192bc..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=2,hidden_size=27392": { - "BLOCK_K": 32, - "SPLIT_K": 128, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index e468805dd6bf..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=2,hidden_size=2752": { - "BLOCK_K": 64, - "SPLIT_K": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index ffbb85f3cc9d..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=2,hidden_size=27648": { - "BLOCK_K": 64, - "SPLIT_K": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 77046010f9f9..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=2,hidden_size=3072": { - "BLOCK_K": 64, - "SPLIT_K": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 101a1fde2e17..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=2,hidden_size=32000": { - "BLOCK_K": 128, - "SPLIT_K": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=32256 
device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 97ecc56f5a4b..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=2,hidden_size=32256": { - "BLOCK_K": 128, - "SPLIT_K": 128, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index f1f3fa79859c..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=2,hidden_size=32512": { - "BLOCK_K": 128, - "SPLIT_K": 128, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index f26ad0121efa..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=2,hidden_size=32768": { - "BLOCK_K": 64, - "SPLIT_K": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 08f2719e2a70..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=2,hidden_size=33024": { - "BLOCK_K": 64, - "SPLIT_K": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 19650dbe5c76..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=2,hidden_size=3328": { - "BLOCK_K": 64, - "SPLIT_K": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 43b03a77fccf..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=2,hidden_size=3456": { - "BLOCK_K": 64, - "SPLIT_K": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 5ea0fd9b7fb1..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=2,hidden_size=3584": { - "BLOCK_K": 64, - "SPLIT_K": 64, - "num_warps": 8 - } -} diff --git 
a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 97e4bf17acb5..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=2,hidden_size=36864": { - "BLOCK_K": 64, - "SPLIT_K": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index b201e52f3f1a..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=2,hidden_size=4096": { - "BLOCK_K": 128, - "SPLIT_K": 32, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index d79ac16a16e0..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=2,hidden_size=43264": { - "BLOCK_K": 128, - "SPLIT_K": 128, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 9d1bc041bfa0..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=2,hidden_size=4608": { - "BLOCK_K": 32, - "SPLIT_K": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index af3fbc6aa009..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=2,hidden_size=49152": { - "BLOCK_K": 32, - "SPLIT_K": 256, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 3e6ab8e331e8..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=2,hidden_size=512": { - "BLOCK_K": 32, - "SPLIT_K": 16, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 3824f7dc6657..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=5120 
device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=2,hidden_size=5120": { - "BLOCK_K": 32, - "SPLIT_K": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index d1fd2e96658a..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=2,hidden_size=5504": { - "BLOCK_K": 32, - "SPLIT_K": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index d9642bf1e5bf..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=2,hidden_size=5632": { - "BLOCK_K": 32, - "SPLIT_K": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 89e5b64ce96b..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=2,hidden_size=6144": { - "BLOCK_K": 32, - "SPLIT_K": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 2345b7e5e882..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=2,hidden_size=6400": { - "BLOCK_K": 64, - "SPLIT_K": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 6de6f1c12486..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=2,hidden_size=64000": { - "BLOCK_K": 64, - "SPLIT_K": 256, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index b6c138a55f28..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=2,hidden_size=64256": { - "BLOCK_K": 128, - "SPLIT_K": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=6848 
device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 4a63ed737b7a..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=2,hidden_size=6848": { - "BLOCK_K": 64, - "SPLIT_K": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index ee7b427fbae6..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=2,hidden_size=6912": { - "BLOCK_K": 64, - "SPLIT_K": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 345cb9192105..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=2,hidden_size=7168": { - "BLOCK_K": 32, - "SPLIT_K": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 91dabeb15527..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=2,hidden_size=8192": { - "BLOCK_K": 64, - "SPLIT_K": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index b90e9970c8c0..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=2,hidden_size=9216": { - "BLOCK_K": 32, - "SPLIT_K": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index d750dbc94ebf..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=24,hidden_size=1024": { - "BLOCK_K": 32, - "SPLIT_K": 8, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 422be89522eb..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=24,hidden_size=10240": { - "BLOCK_K": 128, - "SPLIT_K": 16, - "num_warps": 4 - } -} diff --git 
a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index a43fa1f140e9..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=24,hidden_size=102400": { - "BLOCK_K": 128, - "SPLIT_K": 16, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 89e3dac0a31e..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=24,hidden_size=102656": { - "BLOCK_K": 128, - "SPLIT_K": 16, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 8c75da6742a4..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=24,hidden_size=11008": { - "BLOCK_K": 32, - "SPLIT_K": 32, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 81d9eeae3f90..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=24,hidden_size=1152": { - "BLOCK_K": 32, - "SPLIT_K": 8, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 32c16964bada..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=24,hidden_size=128": { - "BLOCK_K": 32, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index cd66a4abdfdd..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=24,hidden_size=1280": { - "BLOCK_K": 32, - "SPLIT_K": 16, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 6e95d75f0b7a..000000000000 --- 
a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=24,hidden_size=128000": { - "BLOCK_K": 128, - "SPLIT_K": 16, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 6ffa41368045..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=24,hidden_size=128256": { - "BLOCK_K": 64, - "SPLIT_K": 8, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 0c993ab8ef41..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=24,hidden_size=13824": { - "BLOCK_K": 128, - "SPLIT_K": 8, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 4bef43ff2a51..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=24,hidden_size=14336": { - "BLOCK_K": 64, - "SPLIT_K": 8, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 382f702b15f5..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=24,hidden_size=1536": { - "BLOCK_K": 32, - "SPLIT_K": 16, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 1b6fb8d0262c..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=24,hidden_size=15360": { - "BLOCK_K": 64, - "SPLIT_K": 8, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 9f9ab7d1da6f..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=24,hidden_size=2048": { - "BLOCK_K": 32, - "SPLIT_K": 16, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=22016 
device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 2ca6015cc8e2..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=24,hidden_size=22016": { - "BLOCK_K": 64, - "SPLIT_K": 16, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index c01c77b849ad..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=24,hidden_size=2304": { - "BLOCK_K": 32, - "SPLIT_K": 16, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 1ffc10aab8a9..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=24,hidden_size=24576": { - "BLOCK_K": 128, - "SPLIT_K": 16, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 9c90c8e5d3d6..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=24,hidden_size=256": { - "BLOCK_K": 32, - "SPLIT_K": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index a4a5aff5f9bf..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=24,hidden_size=2560": { - "BLOCK_K": 64, - "SPLIT_K": 16, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index bfad4944a155..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=24,hidden_size=27392": { - "BLOCK_K": 128, - "SPLIT_K": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index f377b9487847..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ 
-1,7 +0,0 @@ -{ - "batchs=24,hidden_size=2752": { - "BLOCK_K": 128, - "SPLIT_K": 32, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 294a45493519..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=24,hidden_size=27648": { - "BLOCK_K": 64, - "SPLIT_K": 32, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 5dd926e91002..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=24,hidden_size=3072": { - "BLOCK_K": 128, - "SPLIT_K": 16, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 7b41f769bfea..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=24,hidden_size=32000": { - "BLOCK_K": 128, - "SPLIT_K": 16, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 2f376b9d3d99..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=24,hidden_size=32256": { - "BLOCK_K": 128, - "SPLIT_K": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 5a03b6ba2242..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=24,hidden_size=32512": { - "BLOCK_K": 64, - "SPLIT_K": 16, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index a093a839dd4e..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=24,hidden_size=32768": { - "BLOCK_K": 64, - "SPLIT_K": 16, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=33024 
device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 904257e85be9..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=24,hidden_size=33024": { - "BLOCK_K": 64, - "SPLIT_K": 32, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 4f41954d2c44..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=24,hidden_size=3328": { - "BLOCK_K": 32, - "SPLIT_K": 64, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 7b430bb3ef84..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=24,hidden_size=3456": { - "BLOCK_K": 128, - "SPLIT_K": 16, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 5545e8a9bb58..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=24,hidden_size=3584": { - "BLOCK_K": 32, - "SPLIT_K": 32, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 76e30d5c6721..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=24,hidden_size=36864": { - "BLOCK_K": 256, - "SPLIT_K": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 60adb9b594de..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=24,hidden_size=4096": { - "BLOCK_K": 64, - "SPLIT_K": 64, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 70fac02be5f4..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=24,hidden_size=43264": { - "BLOCK_K": 64, - "SPLIT_K": 32, - "num_warps": 4 - } -} 
diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index fac51807dd14..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=24,hidden_size=4608": { - "BLOCK_K": 64, - "SPLIT_K": 128, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index f8af00a3f0b2..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=24,hidden_size=49152": { - "BLOCK_K": 256, - "SPLIT_K": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index ec0716115b72..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=24,hidden_size=512": { - "BLOCK_K": 32, - "SPLIT_K": 8, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 5bfce900dd59..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=24,hidden_size=5120": { - "BLOCK_K": 128, - "SPLIT_K": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 142a66db84ff..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=24,hidden_size=5504": { - "BLOCK_K": 128, - "SPLIT_K": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 1e819ccf88b9..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=24,hidden_size=5632": { - "BLOCK_K": 128, - "SPLIT_K": 16, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 640dfe5c4aab..000000000000 --- 
a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=24,hidden_size=6144": { - "BLOCK_K": 64, - "SPLIT_K": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 3f9f49c856ae..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=24,hidden_size=6400": { - "BLOCK_K": 64, - "SPLIT_K": 16, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index ee67fafb3f47..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=24,hidden_size=64000": { - "BLOCK_K": 128, - "SPLIT_K": 16, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index a4c7d1951a2d..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=24,hidden_size=64256": { - "BLOCK_K": 256, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 2668afddbf9d..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=24,hidden_size=6848": { - "BLOCK_K": 64, - "SPLIT_K": 16, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 432191c74e6e..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=24,hidden_size=6912": { - "BLOCK_K": 64, - "SPLIT_K": 16, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 965794c29584..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=24,hidden_size=7168": { - "BLOCK_K": 128, - "SPLIT_K": 16, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=8192 
device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 4099c8b31019..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=24,hidden_size=8192": { - "BLOCK_K": 128, - "SPLIT_K": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 54f120427642..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=24,hidden_size=9216": { - "BLOCK_K": 256, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index bc0d77a48b76..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=32,hidden_size=1024": { - "BLOCK_K": 32, - "SPLIT_K": 8, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 4d245dd8606c..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=32,hidden_size=10240": { - "BLOCK_K": 128, - "SPLIT_K": 8, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 15f2726ffb7f..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=32,hidden_size=102400": { - "BLOCK_K": 64, - "SPLIT_K": 8, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 610e795d0889..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=32,hidden_size=102656": { - "BLOCK_K": 128, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 86f0bd139844..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json 
+++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=32,hidden_size=11008": { - "BLOCK_K": 256, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 525db2cd9c83..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=32,hidden_size=1152": { - "BLOCK_K": 32, - "SPLIT_K": 8, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index f154c1ac63ae..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=32,hidden_size=128": { - "BLOCK_K": 32, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 4d9ca99f621b..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=32,hidden_size=1280": { - "BLOCK_K": 32, - "SPLIT_K": 8, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 4ce6fea19d94..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=32,hidden_size=128000": { - "BLOCK_K": 128, - "SPLIT_K": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 61b5a2e19b2a..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=32,hidden_size=128256": { - "BLOCK_K": 64, - "SPLIT_K": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index ee27b7eb7145..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=32,hidden_size=13824": { - "BLOCK_K": 32, - "SPLIT_K": 16, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=14336 
device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index f1c0786c18a5..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=32,hidden_size=14336": { - "BLOCK_K": 64, - "SPLIT_K": 8, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 2e80cd18b7e3..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=32,hidden_size=1536": { - "BLOCK_K": 64, - "SPLIT_K": 8, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index a7ebd2698a89..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=32,hidden_size=15360": { - "BLOCK_K": 32, - "SPLIT_K": 8, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index a70cbb63546d..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=32,hidden_size=2048": { - "BLOCK_K": 64, - "SPLIT_K": 8, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 411a6d693f37..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=32,hidden_size=22016": { - "BLOCK_K": 64, - "SPLIT_K": 8, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 7bf4485bdafb..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=32,hidden_size=2304": { - "BLOCK_K": 32, - "SPLIT_K": 16, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 4b83542ed831..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=32,hidden_size=24576": { - "BLOCK_K": 64, - "SPLIT_K": 16, - "num_warps": 4 - } -} 
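[Editor's note, not part of the patch] For orientation while reading these configs: assuming the bgmv shrink kernel splits the hidden (K) dimension across SPLIT_K program instances in tiles of BLOCK_K, each instance loops roughly ceil(hidden_size / (BLOCK_K * SPLIT_K)) times. The snippet below only illustrates that arithmetic for two of the shapes deleted above; it is a reader aid, not code from the patch.

    from math import ceil

    # Per-program K-loop trip count implied by a tuned config (assumed model).
    def k_iters(hidden_size: int, block_k: int, split_k: int) -> int:
        return ceil(hidden_size / (block_k * split_k))

    # E.g. two of the batchs=32 configs removed above:
    assert k_iters(24576, block_k=64, split_k=16) == 24  # hidden_size=24576
    assert k_iters(2048, block_k=64, split_k=8) == 4     # hidden_size=2048
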
diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index fcdf01c82b6e..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=32,hidden_size=256": { - "BLOCK_K": 32, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 27fa6e4ab908..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=32,hidden_size=2560": { - "BLOCK_K": 32, - "SPLIT_K": 16, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index f32c6f35e55b..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=32,hidden_size=27392": { - "BLOCK_K": 64, - "SPLIT_K": 16, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 97e139922a3b..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=32,hidden_size=2752": { - "BLOCK_K": 32, - "SPLIT_K": 64, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 633e173ede3d..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=32,hidden_size=27648": { - "BLOCK_K": 128, - "SPLIT_K": 8, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 4b2853e27e6d..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=32,hidden_size=3072": { - "BLOCK_K": 32, - "SPLIT_K": 16, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index dbca49ebf47f..000000000000 --- 
a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=32,hidden_size=32000": { - "BLOCK_K": 128, - "SPLIT_K": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index bbbb809b4bac..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=32,hidden_size=32256": { - "BLOCK_K": 256, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 2db20b0f523a..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=32,hidden_size=32512": { - "BLOCK_K": 128, - "SPLIT_K": 8, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index b76c39dbc79c..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=32,hidden_size=32768": { - "BLOCK_K": 256, - "SPLIT_K": 4, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 95de274c392b..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=32,hidden_size=33024": { - "BLOCK_K": 64, - "SPLIT_K": 8, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 31cc0f0988df..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=32,hidden_size=3328": { - "BLOCK_K": 128, - "SPLIT_K": 32, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index d5848f9c0dc2..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=32,hidden_size=3456": { - "BLOCK_K": 128, - "SPLIT_K": 32, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=3584 
device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 928c4793a1b4..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=32,hidden_size=3584": { - "BLOCK_K": 128, - "SPLIT_K": 8, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 87333a2977e6..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=32,hidden_size=36864": { - "BLOCK_K": 128, - "SPLIT_K": 8, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index a13f1ae708cc..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=32,hidden_size=4096": { - "BLOCK_K": 256, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index e263c22b48c8..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=32,hidden_size=43264": { - "BLOCK_K": 128, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 28ccb7922928..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=32,hidden_size=4608": { - "BLOCK_K": 64, - "SPLIT_K": 8, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 74bdc063f829..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=32,hidden_size=49152": { - "BLOCK_K": 32, - "SPLIT_K": 256, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index e65f07e0c1b7..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null 
@@ -1,7 +0,0 @@ -{ - "batchs=32,hidden_size=512": { - "BLOCK_K": 32, - "SPLIT_K": 8, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 86b22822e193..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=32,hidden_size=5120": { - "BLOCK_K": 128, - "SPLIT_K": 8, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 068d658420a2..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=32,hidden_size=5504": { - "BLOCK_K": 128, - "SPLIT_K": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index bb22b22c0cf4..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=32,hidden_size=5632": { - "BLOCK_K": 128, - "SPLIT_K": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 693e8b466480..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=32,hidden_size=6144": { - "BLOCK_K": 32, - "SPLIT_K": 16, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 72e213c9c841..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=32,hidden_size=6400": { - "BLOCK_K": 128, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 8983a00a5dc0..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=32,hidden_size=64000": { - "BLOCK_K": 64, - "SPLIT_K": 8, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json 
deleted file mode 100644 index 7ebc1a433047..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=32,hidden_size=64256": { - "BLOCK_K": 64, - "SPLIT_K": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index baf8710e7904..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=32,hidden_size=6848": { - "BLOCK_K": 256, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 83d82eab352e..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=32,hidden_size=6912": { - "BLOCK_K": 32, - "SPLIT_K": 16, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index ea365d60b5f7..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=32,hidden_size=7168": { - "BLOCK_K": 128, - "SPLIT_K": 4, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index e0e266cba149..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=32,hidden_size=8192": { - "BLOCK_K": 128, - "SPLIT_K": 8, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index df672f000e51..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=32,hidden_size=9216": { - "BLOCK_K": 64, - "SPLIT_K": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 21b7f37b6f10..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=4,hidden_size=1024": { - "BLOCK_K": 64, - "SPLIT_K": 16, - "num_warps": 8 - } -} diff --git 
a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 6780bd1b6061..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=4,hidden_size=10240": { - "BLOCK_K": 64, - "SPLIT_K": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index c8d5d48f3067..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=4,hidden_size=102400": { - "BLOCK_K": 128, - "SPLIT_K": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 8092de1d055e..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=4,hidden_size=102656": { - "BLOCK_K": 64, - "SPLIT_K": 32, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 25352c1f541a..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=4,hidden_size=11008": { - "BLOCK_K": 64, - "SPLIT_K": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 72a9af0df017..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=4,hidden_size=1152": { - "BLOCK_K": 64, - "SPLIT_K": 32, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 8b8a4d4ba0a6..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=4,hidden_size=128": { - "BLOCK_K": 32, - "SPLIT_K": 16, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 5a63ccf5ea27..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=1280 
device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=4,hidden_size=1280": { - "BLOCK_K": 64, - "SPLIT_K": 32, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index f5322f696374..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=4,hidden_size=128000": { - "BLOCK_K": 64, - "SPLIT_K": 32, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 9c0b7751d2a2..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=4,hidden_size=128256": { - "BLOCK_K": 128, - "SPLIT_K": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 3645b8d9bc6e..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=4,hidden_size=13824": { - "BLOCK_K": 64, - "SPLIT_K": 16, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index f8e1776dbeac..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=4,hidden_size=14336": { - "BLOCK_K": 64, - "SPLIT_K": 32, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 67a3fb9e41c7..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=4,hidden_size=1536": { - "BLOCK_K": 64, - "SPLIT_K": 32, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index d145bf3a8058..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=4,hidden_size=15360": { - "BLOCK_K": 256, - "SPLIT_K": 32, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json 
b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 17bbd1c727ce..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=4,hidden_size=2048": { - "BLOCK_K": 128, - "SPLIT_K": 16, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 50bb33d9ddc2..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=4,hidden_size=22016": { - "BLOCK_K": 32, - "SPLIT_K": 64, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 28de03a38564..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=4,hidden_size=2304": { - "BLOCK_K": 128, - "SPLIT_K": 32, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index e8696121d871..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=4,hidden_size=24576": { - "BLOCK_K": 64, - "SPLIT_K": 256, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index f24f53814e53..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=4,hidden_size=256": { - "BLOCK_K": 32, - "SPLIT_K": 16, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 414e8136350b..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=4,hidden_size=2560": { - "BLOCK_K": 128, - "SPLIT_K": 32, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index d0a7f1ad50ca..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=4,hidden_size=27392": { - "BLOCK_K": 
128, - "SPLIT_K": 64, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 90b14d89c238..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=4,hidden_size=2752": { - "BLOCK_K": 32, - "SPLIT_K": 32, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index dfb4a2036591..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=4,hidden_size=27648": { - "BLOCK_K": 32, - "SPLIT_K": 256, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index afcc971891fd..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=4,hidden_size=3072": { - "BLOCK_K": 128, - "SPLIT_K": 32, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 86627bb4c2b6..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=4,hidden_size=32000": { - "BLOCK_K": 128, - "SPLIT_K": 256, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 18f2e8d01075..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=4,hidden_size=32256": { - "BLOCK_K": 64, - "SPLIT_K": 64, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 60216da8b12c..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=4,hidden_size=32512": { - "BLOCK_K": 128, - "SPLIT_K": 64, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 3f3d7c814ec2..000000000000 --- 
a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=4,hidden_size=32768": { - "BLOCK_K": 32, - "SPLIT_K": 256, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 11f41c734aed..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=4,hidden_size=33024": { - "BLOCK_K": 128, - "SPLIT_K": 16, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index b7b78e643da8..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=4,hidden_size=3328": { - "BLOCK_K": 64, - "SPLIT_K": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index dd301c41083a..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=4,hidden_size=3456": { - "BLOCK_K": 64, - "SPLIT_K": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 00e16062556b..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=4,hidden_size=3584": { - "BLOCK_K": 64, - "SPLIT_K": 32, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 90b71d6a48aa..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=4,hidden_size=36864": { - "BLOCK_K": 128, - "SPLIT_K": 128, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 4567fabf9fa3..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=4,hidden_size=4096": { - "BLOCK_K": 64, - "SPLIT_K": 32, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json 
b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index d758bd2bd5d4..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=4,hidden_size=43264": { - "BLOCK_K": 64, - "SPLIT_K": 128, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 4dd89a2baa91..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=4,hidden_size=4608": { - "BLOCK_K": 128, - "SPLIT_K": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 4e2ea6dc099c..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=4,hidden_size=49152": { - "BLOCK_K": 128, - "SPLIT_K": 32, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 571e5f11fb3a..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=4,hidden_size=512": { - "BLOCK_K": 32, - "SPLIT_K": 16, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 72fbed4df169..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=4,hidden_size=5120": { - "BLOCK_K": 128, - "SPLIT_K": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index cdebe9fa2d4d..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=4,hidden_size=5504": { - "BLOCK_K": 64, - "SPLIT_K": 32, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index b3315ac27857..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=4,hidden_size=5632": { - "BLOCK_K": 64, 
- "SPLIT_K": 32, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 0f58f063add0..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=4,hidden_size=6144": { - "BLOCK_K": 64, - "SPLIT_K": 32, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 36435a35bd1f..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=4,hidden_size=6400": { - "BLOCK_K": 64, - "SPLIT_K": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index a3d53ddfb20c..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=4,hidden_size=64000": { - "BLOCK_K": 128, - "SPLIT_K": 64, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 9ec3fad337d0..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=4,hidden_size=64256": { - "BLOCK_K": 128, - "SPLIT_K": 64, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 48399c1da49f..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=4,hidden_size=6848": { - "BLOCK_K": 64, - "SPLIT_K": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 90a1a1dc7123..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=4,hidden_size=6912": { - "BLOCK_K": 64, - "SPLIT_K": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 490f4fe0958a..000000000000 --- 
a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=4,hidden_size=7168": { - "BLOCK_K": 64, - "SPLIT_K": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 7dfbd884de1e..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=4,hidden_size=8192": { - "BLOCK_K": 64, - "SPLIT_K": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 423a0a8bb660..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=4,hidden_size=9216": { - "BLOCK_K": 32, - "SPLIT_K": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index a23caa15d938..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=40,hidden_size=1024": { - "BLOCK_K": 64, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index da975d0f7f04..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=40,hidden_size=10240": { - "BLOCK_K": 64, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 765e06971801..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=40,hidden_size=102400": { - "BLOCK_K": 256, - "SPLIT_K": 2, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index ad02b596b368..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=40,hidden_size=102656": { - "BLOCK_K": 64, - "SPLIT_K": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=11008 
device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 6ea35e6b4067..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=40,hidden_size=11008": { - "BLOCK_K": 256, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 3c5b43290284..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=40,hidden_size=1152": { - "BLOCK_K": 32, - "SPLIT_K": 8, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index d1d0c2952eec..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=40,hidden_size=128": { - "BLOCK_K": 128, - "SPLIT_K": 1, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 7ac3b09650e6..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=40,hidden_size=1280": { - "BLOCK_K": 32, - "SPLIT_K": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 7e364a12309c..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=40,hidden_size=128000": { - "BLOCK_K": 128, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 362a4794b89d..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=40,hidden_size=128256": { - "BLOCK_K": 128, - "SPLIT_K": 8, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 6daf2db30092..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json +++ 
/dev/null @@ -1,7 +0,0 @@ -{ - "batchs=40,hidden_size=13824": { - "BLOCK_K": 64, - "SPLIT_K": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index d39676778929..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=40,hidden_size=14336": { - "BLOCK_K": 64, - "SPLIT_K": 8, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index ca1458693a59..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=40,hidden_size=1536": { - "BLOCK_K": 64, - "SPLIT_K": 8, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 5caff317b920..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=40,hidden_size=15360": { - "BLOCK_K": 128, - "SPLIT_K": 8, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index bd333dbdea1b..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=40,hidden_size=2048": { - "BLOCK_K": 32, - "SPLIT_K": 16, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 5b17698cf941..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=40,hidden_size=22016": { - "BLOCK_K": 128, - "SPLIT_K": 4, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 8c34a9ffe746..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=40,hidden_size=2304": { - "BLOCK_K": 64, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=24576 
device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 628d5e31f820..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=40,hidden_size=24576": { - "BLOCK_K": 128, - "SPLIT_K": 8, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 53510a4a3176..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=40,hidden_size=256": { - "BLOCK_K": 32, - "SPLIT_K": 8, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 36e4a63d0806..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=40,hidden_size=2560": { - "BLOCK_K": 128, - "SPLIT_K": 16, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index e95f7c421e3c..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=40,hidden_size=27392": { - "BLOCK_K": 128, - "SPLIT_K": 8, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index e9fde38aa4c1..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=40,hidden_size=2752": { - "BLOCK_K": 64, - "SPLIT_K": 16, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 675598e6916c..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=40,hidden_size=27648": { - "BLOCK_K": 32, - "SPLIT_K": 8, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index d0c7c62a69f4..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=40,hidden_size=3072": { - "BLOCK_K": 64, - "SPLIT_K": 32, - "num_warps": 4 - } -} diff 
--git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index d38a57c5cb63..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=40,hidden_size=32000": { - "BLOCK_K": 256, - "SPLIT_K": 2, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index fbf651ee7ab1..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=40,hidden_size=32256": { - "BLOCK_K": 64, - "SPLIT_K": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index f56745eaf5d9..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=40,hidden_size=32512": { - "BLOCK_K": 128, - "SPLIT_K": 8, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index c7e6acf8da4c..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=40,hidden_size=32768": { - "BLOCK_K": 128, - "SPLIT_K": 4, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index e214f20d25dd..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=40,hidden_size=33024": { - "BLOCK_K": 128, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index c3a7b1b3d075..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=40,hidden_size=3328": { - "BLOCK_K": 64, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 70c8b1a5cebe..000000000000 --- 
a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=40,hidden_size=3456": { - "BLOCK_K": 128, - "SPLIT_K": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 7518d2514d3f..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=40,hidden_size=3584": { - "BLOCK_K": 256, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 8b8caf903283..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=40,hidden_size=36864": { - "BLOCK_K": 128, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 24dfdeb31e27..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=40,hidden_size=4096": { - "BLOCK_K": 128, - "SPLIT_K": 8, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 73974a4b5ec4..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=40,hidden_size=43264": { - "BLOCK_K": 256, - "SPLIT_K": 2, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 134f097ee092..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=40,hidden_size=4608": { - "BLOCK_K": 256, - "SPLIT_K": 4, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 99fd65e94e97..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=40,hidden_size=49152": { - "BLOCK_K": 128, - "SPLIT_K": 256, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=512 
device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index a085010d32df..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=40,hidden_size=512": { - "BLOCK_K": 32, - "SPLIT_K": 8, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 50fa49ac53a0..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=40,hidden_size=5120": { - "BLOCK_K": 128, - "SPLIT_K": 8, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 3444417e1cc2..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=40,hidden_size=5504": { - "BLOCK_K": 32, - "SPLIT_K": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 15f14f8a9a1d..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=40,hidden_size=5632": { - "BLOCK_K": 64, - "SPLIT_K": 8, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 1c05dce95c3a..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=40,hidden_size=6144": { - "BLOCK_K": 64, - "SPLIT_K": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 56833977b005..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=40,hidden_size=6400": { - "BLOCK_K": 128, - "SPLIT_K": 4, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index bfa3f47fa2eb..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 
@@ -{ - "batchs=40,hidden_size=64000": { - "BLOCK_K": 64, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 4991337f39f4..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=40,hidden_size=64256": { - "BLOCK_K": 128, - "SPLIT_K": 8, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index ad9494f2d0ef..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=40,hidden_size=6848": { - "BLOCK_K": 64, - "SPLIT_K": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 72ffcbf1889d..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=40,hidden_size=6912": { - "BLOCK_K": 32, - "SPLIT_K": 16, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index c5ae6958ebdf..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=40,hidden_size=7168": { - "BLOCK_K": 256, - "SPLIT_K": 4, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index bdd5edbfc87b..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=40,hidden_size=8192": { - "BLOCK_K": 64, - "SPLIT_K": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 610d6175f27e..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=40,hidden_size=9216": { - "BLOCK_K": 256, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json deleted file 
mode 100644 index 2548ac5d500e..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=48,hidden_size=1024": { - "BLOCK_K": 32, - "SPLIT_K": 8, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 987dc9b61dc2..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=48,hidden_size=10240": { - "BLOCK_K": 64, - "SPLIT_K": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 848a752c0379..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=48,hidden_size=102400": { - "BLOCK_K": 64, - "SPLIT_K": 8, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index faa00296ce21..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=48,hidden_size=102656": { - "BLOCK_K": 128, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index f280fe287d2b..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=48,hidden_size=11008": { - "BLOCK_K": 128, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 19ddc1d36abf..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=48,hidden_size=1152": { - "BLOCK_K": 32, - "SPLIT_K": 8, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index a7f53ede4c6d..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=48,hidden_size=128": { - "BLOCK_K": 128, - "SPLIT_K": 1, - "num_warps": 8 - } -} diff --git 
a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index d3c33217c882..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=48,hidden_size=1280": { - "BLOCK_K": 32, - "SPLIT_K": 8, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index a8fb320c7b01..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=48,hidden_size=128000": { - "BLOCK_K": 64, - "SPLIT_K": 4, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index e0a7c154a14f..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=48,hidden_size=128256": { - "BLOCK_K": 64, - "SPLIT_K": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index c9b76cf2ea95..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=48,hidden_size=13824": { - "BLOCK_K": 128, - "SPLIT_K": 4, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 1fcf12b7eb96..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=48,hidden_size=14336": { - "BLOCK_K": 64, - "SPLIT_K": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index c335578c3dab..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=48,hidden_size=1536": { - "BLOCK_K": 32, - "SPLIT_K": 8, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index b9f810c13912..000000000000 --- 
a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=48,hidden_size=15360": { - "BLOCK_K": 256, - "SPLIT_K": 2, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 215df482ca64..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=48,hidden_size=2048": { - "BLOCK_K": 64, - "SPLIT_K": 32, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 86e68481e14c..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=48,hidden_size=22016": { - "BLOCK_K": 128, - "SPLIT_K": 8, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index ee1aa6e358cf..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=48,hidden_size=2304": { - "BLOCK_K": 128, - "SPLIT_K": 16, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index f00777cd606a..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=48,hidden_size=24576": { - "BLOCK_K": 64, - "SPLIT_K": 16, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 0382b4bfcad6..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=48,hidden_size=256": { - "BLOCK_K": 32, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 0cd5ac59ca34..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=48,hidden_size=2560": { - "BLOCK_K": 64, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=27392 
device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 3e5825452ae7..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=48,hidden_size=27392": { - "BLOCK_K": 32, - "SPLIT_K": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index e9ebb39aaa52..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=48,hidden_size=2752": { - "BLOCK_K": 128, - "SPLIT_K": 8, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 0951d6150b20..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=48,hidden_size=27648": { - "BLOCK_K": 64, - "SPLIT_K": 16, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 4f6ec8b3fcfc..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=48,hidden_size=3072": { - "BLOCK_K": 64, - "SPLIT_K": 64, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 0dc115f3588e..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=48,hidden_size=32000": { - "BLOCK_K": 64, - "SPLIT_K": 8, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 55b7954dd82a..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=48,hidden_size=32256": { - "BLOCK_K": 128, - "SPLIT_K": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 786022725aab..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json +++ 
/dev/null @@ -1,7 +0,0 @@ -{ - "batchs=48,hidden_size=32512": { - "BLOCK_K": 64, - "SPLIT_K": 8, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index a80125409b55..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=48,hidden_size=32768": { - "BLOCK_K": 64, - "SPLIT_K": 16, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index c2c745a5b6ab..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=48,hidden_size=33024": { - "BLOCK_K": 128, - "SPLIT_K": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index e8803b5163ed..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=48,hidden_size=3328": { - "BLOCK_K": 32, - "SPLIT_K": 32, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index d3ee96ee2125..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=48,hidden_size=3456": { - "BLOCK_K": 64, - "SPLIT_K": 16, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 0ba06679888a..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=48,hidden_size=3584": { - "BLOCK_K": 64, - "SPLIT_K": 8, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index db6774f5f21c..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=48,hidden_size=36864": { - "BLOCK_K": 128, - "SPLIT_K": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=4096 
device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index a7ffc7684eaf..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=48,hidden_size=4096": { - "BLOCK_K": 128, - "SPLIT_K": 4, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 6f775a7eaa65..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=48,hidden_size=43264": { - "BLOCK_K": 64, - "SPLIT_K": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 29275d15516d..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=48,hidden_size=4608": { - "BLOCK_K": 64, - "SPLIT_K": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index c1f3159e79ad..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=48,hidden_size=49152": { - "BLOCK_K": 64, - "SPLIT_K": 16, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 43d2c867eb78..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=48,hidden_size=512": { - "BLOCK_K": 32, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 3ef158578c25..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=48,hidden_size=5120": { - "BLOCK_K": 128, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 2fa3f9b2dd5a..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=48,hidden_size=5504": { - "BLOCK_K": 32, - "SPLIT_K": 8, - "num_warps": 8 - } -} diff --git 
a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 3b7df5022eed..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=48,hidden_size=5632": { - "BLOCK_K": 64, - "SPLIT_K": 16, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 5150a9505d59..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=48,hidden_size=6144": { - "BLOCK_K": 128, - "SPLIT_K": 4, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 15bd83f652b2..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=48,hidden_size=6400": { - "BLOCK_K": 32, - "SPLIT_K": 8, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index be3441337184..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=48,hidden_size=64000": { - "BLOCK_K": 64, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 5adbe1354608..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=48,hidden_size=64256": { - "BLOCK_K": 128, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 4d0c8340f94e..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=48,hidden_size=6848": { - "BLOCK_K": 128, - "SPLIT_K": 8, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index f25843901a29..000000000000 --- 
a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=48,hidden_size=6912": { - "BLOCK_K": 128, - "SPLIT_K": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index e7f8fdd5c289..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=48,hidden_size=7168": { - "BLOCK_K": 32, - "SPLIT_K": 8, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 1c620ed90717..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=48,hidden_size=8192": { - "BLOCK_K": 64, - "SPLIT_K": 16, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index ca7f79c797da..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=48,hidden_size=9216": { - "BLOCK_K": 64, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index b18ad9ef740f..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=56,hidden_size=1024": { - "BLOCK_K": 32, - "SPLIT_K": 8, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 7e5e16254b14..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=56,hidden_size=10240": { - "BLOCK_K": 64, - "SPLIT_K": 8, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 66b9016b80d6..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=56,hidden_size=102400": { - "BLOCK_K": 256, - "SPLIT_K": 4, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=102656 
device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 41267d76bb7b..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=56,hidden_size=102656": { - "BLOCK_K": 128, - "SPLIT_K": 2, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 8b1bdd081b38..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=56,hidden_size=11008": { - "BLOCK_K": 64, - "SPLIT_K": 4, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 1b86365ec8d8..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=56,hidden_size=1152": { - "BLOCK_K": 32, - "SPLIT_K": 8, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index a3605ec25c3e..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=56,hidden_size=128": { - "BLOCK_K": 128, - "SPLIT_K": 1, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 177e169046c1..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=56,hidden_size=1280": { - "BLOCK_K": 32, - "SPLIT_K": 8, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index eefe22a77acf..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=56,hidden_size=128000": { - "BLOCK_K": 64, - "SPLIT_K": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 6d4866d51277..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json +++ 
/dev/null @@ -1,7 +0,0 @@ -{ - "batchs=56,hidden_size=128256": { - "BLOCK_K": 128, - "SPLIT_K": 4, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index d25a6e558ef0..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=56,hidden_size=13824": { - "BLOCK_K": 64, - "SPLIT_K": 8, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index c13f346b0444..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=56,hidden_size=14336": { - "BLOCK_K": 64, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 30fefd4cff6f..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=56,hidden_size=1536": { - "BLOCK_K": 32, - "SPLIT_K": 16, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index d70b173c3ae9..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=56,hidden_size=15360": { - "BLOCK_K": 64, - "SPLIT_K": 8, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 9686ba58b423..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=56,hidden_size=2048": { - "BLOCK_K": 64, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index ded89a74da24..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=56,hidden_size=22016": { - "BLOCK_K": 32, - "SPLIT_K": 8, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=2304 
device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 1bf7575e2a95..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=56,hidden_size=2304": { - "BLOCK_K": 32, - "SPLIT_K": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index ff3b1a65cd88..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=56,hidden_size=24576": { - "BLOCK_K": 32, - "SPLIT_K": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 32b1dc19f9a0..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=56,hidden_size=256": { - "BLOCK_K": 32, - "SPLIT_K": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 5a103600df73..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=56,hidden_size=2560": { - "BLOCK_K": 256, - "SPLIT_K": 2, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 862580187e9b..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=56,hidden_size=27392": { - "BLOCK_K": 64, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 765b7fcf0597..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=56,hidden_size=2752": { - "BLOCK_K": 32, - "SPLIT_K": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index dad3be5ede3c..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=56,hidden_size=27648": { - "BLOCK_K": 32, - "SPLIT_K": 8, - "num_warps": 8 - } -} diff 
--git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 5e9de4977a0d..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=56,hidden_size=3072": { - "BLOCK_K": 128, - "SPLIT_K": 4, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 12c92e427c5d..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=56,hidden_size=32000": { - "BLOCK_K": 128, - "SPLIT_K": 4, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index cece35827652..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=56,hidden_size=32256": { - "BLOCK_K": 256, - "SPLIT_K": 2, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 05a07685f4f8..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=56,hidden_size=32512": { - "BLOCK_K": 64, - "SPLIT_K": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 19b832c59326..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=56,hidden_size=32768": { - "BLOCK_K": 64, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 13256a11de49..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=56,hidden_size=33024": { - "BLOCK_K": 256, - "SPLIT_K": 4, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 71ca4b92a1c6..000000000000 --- 
a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=56,hidden_size=3328": { - "BLOCK_K": 64, - "SPLIT_K": 8, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 7a02f9a5753f..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=56,hidden_size=3456": { - "BLOCK_K": 32, - "SPLIT_K": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 36ce27a20c7c..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=56,hidden_size=3584": { - "BLOCK_K": 32, - "SPLIT_K": 16, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index e473965d0d89..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=56,hidden_size=36864": { - "BLOCK_K": 128, - "SPLIT_K": 4, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index aa7ad780b5ed..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=56,hidden_size=4096": { - "BLOCK_K": 32, - "SPLIT_K": 8, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index d09de854e154..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=56,hidden_size=43264": { - "BLOCK_K": 128, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index c600f9a4a564..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=56,hidden_size=4608": { - "BLOCK_K": 128, - "SPLIT_K": 4, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=49152 
device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index f2fcd1b21602..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=56,hidden_size=49152": { - "BLOCK_K": 128, - "SPLIT_K": 2, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index a0f8e37807d3..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=56,hidden_size=512": { - "BLOCK_K": 32, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 9fbe3a6da66c..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=56,hidden_size=5120": { - "BLOCK_K": 32, - "SPLIT_K": 16, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 60d5c4dc40be..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=56,hidden_size=5504": { - "BLOCK_K": 256, - "SPLIT_K": 2, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index a36eb605fa4c..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=56,hidden_size=5632": { - "BLOCK_K": 64, - "SPLIT_K": 8, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index ad7b11ec6f27..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=56,hidden_size=6144": { - "BLOCK_K": 64, - "SPLIT_K": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 528a7abd6ffd..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 
@@ -{ - "batchs=56,hidden_size=6400": { - "BLOCK_K": 64, - "SPLIT_K": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 5e1779ffba43..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=56,hidden_size=64000": { - "BLOCK_K": 64, - "SPLIT_K": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index c89b534e12a5..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=56,hidden_size=64256": { - "BLOCK_K": 64, - "SPLIT_K": 8, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 7943d0cfde5e..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=56,hidden_size=6848": { - "BLOCK_K": 64, - "SPLIT_K": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 06705f2aa342..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=56,hidden_size=6912": { - "BLOCK_K": 128, - "SPLIT_K": 4, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 21fcc4c55de4..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=56,hidden_size=7168": { - "BLOCK_K": 32, - "SPLIT_K": 16, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index c7b2c16677ae..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=56,hidden_size=8192": { - "BLOCK_K": 64, - "SPLIT_K": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json deleted file 
mode 100644 index 03d554a6d65e..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=56,hidden_size=9216": { - "BLOCK_K": 256, - "SPLIT_K": 2, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index d0938a7af883..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=64,hidden_size=1024": { - "BLOCK_K": 32, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 07bcdbf20094..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=64,hidden_size=10240": { - "BLOCK_K": 64, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index a3b3e1a40bd6..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=64,hidden_size=102400": { - "BLOCK_K": 128, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 64d9dace43e6..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=64,hidden_size=102656": { - "BLOCK_K": 256, - "SPLIT_K": 2, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 5dfc981bc2f6..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=64,hidden_size=11008": { - "BLOCK_K": 128, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index bbefa2f6fdc6..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=64,hidden_size=1152": { - "BLOCK_K": 64, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git 
a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index c6f10cf4989f..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=64,hidden_size=128": { - "BLOCK_K": 128, - "SPLIT_K": 1, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 4663289ea195..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=64,hidden_size=1280": { - "BLOCK_K": 64, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 4f50abb8852a..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=64,hidden_size=128000": { - "BLOCK_K": 128, - "SPLIT_K": 4, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index a3cc61173170..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=64,hidden_size=128256": { - "BLOCK_K": 128, - "SPLIT_K": 2, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 6a6d73ea7f2a..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=64,hidden_size=13824": { - "BLOCK_K": 256, - "SPLIT_K": 2, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 7ed41fea026e..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=64,hidden_size=14336": { - "BLOCK_K": 64, - "SPLIT_K": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 230edf27c64b..000000000000 --- 
a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=64,hidden_size=1536": { - "BLOCK_K": 64, - "SPLIT_K": 32, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index cdd186cc4b9f..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=64,hidden_size=15360": { - "BLOCK_K": 64, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 4f76011c145a..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=64,hidden_size=2048": { - "BLOCK_K": 32, - "SPLIT_K": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 7798c41d0f06..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=64,hidden_size=22016": { - "BLOCK_K": 128, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 49d5d5dd3ac6..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=64,hidden_size=2304": { - "BLOCK_K": 64, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 9f700c6130b4..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=64,hidden_size=24576": { - "BLOCK_K": 128, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 4cc76fa38c81..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=64,hidden_size=256": { - "BLOCK_K": 32, - "SPLIT_K": 4, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=2560 
device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 76ee8a7d6102..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=64,hidden_size=2560": { - "BLOCK_K": 128, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 71b40355f9d8..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=64,hidden_size=27392": { - "BLOCK_K": 256, - "SPLIT_K": 2, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 7cdad6a26049..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=64,hidden_size=2752": { - "BLOCK_K": 32, - "SPLIT_K": 16, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index a17c9772d2bb..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=64,hidden_size=27648": { - "BLOCK_K": 128, - "SPLIT_K": 4, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index a6d8b40ac252..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=64,hidden_size=3072": { - "BLOCK_K": 32, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index bc0b651ecd4c..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=64,hidden_size=32000": { - "BLOCK_K": 64, - "SPLIT_K": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index c9c7dc3f4468..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json +++ 
/dev/null @@ -1,7 +0,0 @@ -{ - "batchs=64,hidden_size=32256": { - "BLOCK_K": 128, - "SPLIT_K": 4, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 6892c863631b..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=64,hidden_size=32512": { - "BLOCK_K": 128, - "SPLIT_K": 2, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index e2f661dda26c..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=64,hidden_size=32768": { - "BLOCK_K": 64, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 7fa15ee16716..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=64,hidden_size=33024": { - "BLOCK_K": 256, - "SPLIT_K": 2, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 95868bcb7456..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=64,hidden_size=3328": { - "BLOCK_K": 64, - "SPLIT_K": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 4413b0d3675b..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=64,hidden_size=3456": { - "BLOCK_K": 256, - "SPLIT_K": 2, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 5899404b9634..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=64,hidden_size=3584": { - "BLOCK_K": 128, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=36864 
device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index d89c80fb30b9..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=64,hidden_size=36864": { - "BLOCK_K": 128, - "SPLIT_K": 4, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 76c2211485c3..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=64,hidden_size=4096": { - "BLOCK_K": 256, - "SPLIT_K": 2, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index b6f398c0b076..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=64,hidden_size=43264": { - "BLOCK_K": 128, - "SPLIT_K": 4, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index acec64f55cd6..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=64,hidden_size=4608": { - "BLOCK_K": 128, - "SPLIT_K": 2, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 7163d4ab8c39..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=64,hidden_size=49152": { - "BLOCK_K": 128, - "SPLIT_K": 2, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 0925d222a787..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=64,hidden_size=512": { - "BLOCK_K": 32, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 97ea276d1953..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=64,hidden_size=5120": { - "BLOCK_K": 128, - "SPLIT_K": 2, - "num_warps": 8 - } -} diff 
--git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index d8383bc693fe..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=64,hidden_size=5504": { - "BLOCK_K": 64, - "SPLIT_K": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 7922df8c0829..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=64,hidden_size=5632": { - "BLOCK_K": 256, - "SPLIT_K": 2, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index fa5851fd7502..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=64,hidden_size=6144": { - "BLOCK_K": 256, - "SPLIT_K": 2, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index bd05c1814c89..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=64,hidden_size=6400": { - "BLOCK_K": 64, - "SPLIT_K": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 797f53636838..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=64,hidden_size=64000": { - "BLOCK_K": 64, - "SPLIT_K": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 3a1619a239c1..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=64,hidden_size=64256": { - "BLOCK_K": 64, - "SPLIT_K": 4, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index e953b5c67710..000000000000 --- 
a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=64,hidden_size=6848": { - "BLOCK_K": 64, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 423d7b24f01d..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=64,hidden_size=6912": { - "BLOCK_K": 64, - "SPLIT_K": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index a7f10bd0d49f..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=64,hidden_size=7168": { - "BLOCK_K": 128, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 435dadeca1af..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=64,hidden_size=8192": { - "BLOCK_K": 64, - "SPLIT_K": 4, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index f346f15c2e23..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=64,hidden_size=9216": { - "BLOCK_K": 64, - "SPLIT_K": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 0c893f180a10..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=72,hidden_size=1024": { - "BLOCK_K": 32, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 1e4afde2e512..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=72,hidden_size=10240": { - "BLOCK_K": 64, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=102400 
device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 978cd7ab7325..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=72,hidden_size=102400": { - "BLOCK_K": 256, - "SPLIT_K": 2, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index c7bdb214bd4c..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=72,hidden_size=102656": { - "BLOCK_K": 64, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 6d6f21405c65..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=72,hidden_size=11008": { - "BLOCK_K": 32, - "SPLIT_K": 4, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 80cf30cfef96..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=72,hidden_size=1152": { - "BLOCK_K": 32, - "SPLIT_K": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index f8103e769b18..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=72,hidden_size=128": { - "BLOCK_K": 64, - "SPLIT_K": 1, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index c3d44a73f35b..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=72,hidden_size=1280": { - "BLOCK_K": 128, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index cae90f69a184..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json +++ 
/dev/null @@ -1,7 +0,0 @@ -{ - "batchs=72,hidden_size=128000": { - "BLOCK_K": 128, - "SPLIT_K": 2, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index bba9830ad8b8..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=72,hidden_size=128256": { - "BLOCK_K": 128, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 3107dffe8924..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=72,hidden_size=13824": { - "BLOCK_K": 64, - "SPLIT_K": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index a0e14bab1eaf..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=72,hidden_size=14336": { - "BLOCK_K": 64, - "SPLIT_K": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 5373e616c435..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=72,hidden_size=1536": { - "BLOCK_K": 32, - "SPLIT_K": 16, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index f891ab67fecf..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=72,hidden_size=15360": { - "BLOCK_K": 256, - "SPLIT_K": 2, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index b0a9bcbf635d..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=72,hidden_size=2048": { - "BLOCK_K": 256, - "SPLIT_K": 2, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=22016 
device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 400122f387f5..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=72,hidden_size=22016": { - "BLOCK_K": 128, - "SPLIT_K": 2, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 6b0e09332d14..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=72,hidden_size=2304": { - "BLOCK_K": 128, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 7c718f80e1b4..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=72,hidden_size=24576": { - "BLOCK_K": 64, - "SPLIT_K": 4, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 109cc3a80846..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=72,hidden_size=256": { - "BLOCK_K": 32, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 66199975e837..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=72,hidden_size=2560": { - "BLOCK_K": 128, - "SPLIT_K": 2, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index e8654beb8e5d..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=72,hidden_size=27392": { - "BLOCK_K": 128, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 7f2de73261ab..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=72,hidden_size=2752": { - "BLOCK_K": 64, - "SPLIT_K": 8, - "num_warps": 4 - } -} diff 
--git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 5d07e0d228b6..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=72,hidden_size=27648": { - "BLOCK_K": 128, - "SPLIT_K": 2, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 08c31b8980d1..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=72,hidden_size=3072": { - "BLOCK_K": 64, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 3b025579accd..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=72,hidden_size=32000": { - "BLOCK_K": 128, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 70b96ee453df..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=72,hidden_size=32256": { - "BLOCK_K": 128, - "SPLIT_K": 4, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 844d7ed28415..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=72,hidden_size=32512": { - "BLOCK_K": 64, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 11201ec67bde..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=72,hidden_size=32768": { - "BLOCK_K": 64, - "SPLIT_K": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index e2ced87f84be..000000000000 --- 
a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=72,hidden_size=33024": { - "BLOCK_K": 128, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 00c493a9d303..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=72,hidden_size=3328": { - "BLOCK_K": 128, - "SPLIT_K": 4, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 463c4846743a..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=72,hidden_size=3456": { - "BLOCK_K": 128, - "SPLIT_K": 4, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index c9798336f74f..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=72,hidden_size=3584": { - "BLOCK_K": 64, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index e6e44d69d91b..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=72,hidden_size=36864": { - "BLOCK_K": 64, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index bd730acc7051..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=72,hidden_size=4096": { - "BLOCK_K": 128, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index e13c0d95b195..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=72,hidden_size=43264": { - "BLOCK_K": 128, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=4608 
device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index f1a4e393519c..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=72,hidden_size=4608": { - "BLOCK_K": 64, - "SPLIT_K": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 2ef93f449226..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=72,hidden_size=49152": { - "BLOCK_K": 128, - "SPLIT_K": 2, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 0004a05e8a1e..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=72,hidden_size=512": { - "BLOCK_K": 32, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index dc50b17d249d..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=72,hidden_size=5120": { - "BLOCK_K": 64, - "SPLIT_K": 4, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index bf12510a5425..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=72,hidden_size=5504": { - "BLOCK_K": 128, - "SPLIT_K": 4, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 2ac3724620af..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=72,hidden_size=5632": { - "BLOCK_K": 32, - "SPLIT_K": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index e4ab9dc8b8ba..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 
@@ -{ - "batchs=72,hidden_size=6144": { - "BLOCK_K": 64, - "SPLIT_K": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 8383761bc837..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=72,hidden_size=6400": { - "BLOCK_K": 64, - "SPLIT_K": 4, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 097ee2ec6574..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=72,hidden_size=64000": { - "BLOCK_K": 128, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 6ec204c9c111..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=72,hidden_size=64256": { - "BLOCK_K": 64, - "SPLIT_K": 4, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 847e461c0323..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=72,hidden_size=6848": { - "BLOCK_K": 64, - "SPLIT_K": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index e46f913737b3..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=72,hidden_size=6912": { - "BLOCK_K": 128, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 754a87f1bbe8..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=72,hidden_size=7168": { - "BLOCK_K": 32, - "SPLIT_K": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json deleted file 
mode 100644 index 83cb91d60ca7..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=72,hidden_size=8192": { - "BLOCK_K": 128, - "SPLIT_K": 4, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 7081cf4076bb..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=72,hidden_size=9216": { - "BLOCK_K": 256, - "SPLIT_K": 2, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 247e28d35bd5..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=8,hidden_size=1024": { - "BLOCK_K": 32, - "SPLIT_K": 16, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index ea4f2a102c02..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=8,hidden_size=10240": { - "BLOCK_K": 128, - "SPLIT_K": 16, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 8ec9a4f020e6..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=8,hidden_size=102400": { - "BLOCK_K": 256, - "SPLIT_K": 8, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index d7e734b9dbfe..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=8,hidden_size=102656": { - "BLOCK_K": 512, - "SPLIT_K": 16, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index daf1cb8ae2b0..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=8,hidden_size=11008": { - "BLOCK_K": 128, - "SPLIT_K": 128, - "num_warps": 8 - } -} diff --git 
a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 002abc7662a5..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=8,hidden_size=1152": { - "BLOCK_K": 32, - "SPLIT_K": 16, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 92c8b78468f9..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=8,hidden_size=128": { - "BLOCK_K": 32, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index d901fce7e335..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=8,hidden_size=1280": { - "BLOCK_K": 32, - "SPLIT_K": 16, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 093e42b6513f..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=8,hidden_size=128000": { - "BLOCK_K": 128, - "SPLIT_K": 32, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 4443e8cbbf33..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=8,hidden_size=128256": { - "BLOCK_K": 128, - "SPLIT_K": 32, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index fc23807fa76e..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=8,hidden_size=13824": { - "BLOCK_K": 64, - "SPLIT_K": 32, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index ddbdd089d658..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=14336 
device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=8,hidden_size=14336": { - "BLOCK_K": 32, - "SPLIT_K": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index e35893b40242..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=8,hidden_size=1536": { - "BLOCK_K": 32, - "SPLIT_K": 16, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 37acdda7a634..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=8,hidden_size=15360": { - "BLOCK_K": 32, - "SPLIT_K": 32, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index ab9441a7b3ce..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=8,hidden_size=2048": { - "BLOCK_K": 32, - "SPLIT_K": 32, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 59c73ba026de..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=8,hidden_size=22016": { - "BLOCK_K": 256, - "SPLIT_K": 8, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 002f8b5c8968..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=8,hidden_size=2304": { - "BLOCK_K": 32, - "SPLIT_K": 32, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 4ae4d6f4734c..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=8,hidden_size=24576": { - "BLOCK_K": 64, - "SPLIT_K": 32, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=256 
device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 4e9bf0aba106..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=8,hidden_size=256": { - "BLOCK_K": 32, - "SPLIT_K": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 1d191860b79c..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=8,hidden_size=2560": { - "BLOCK_K": 32, - "SPLIT_K": 32, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 23761e39d7ad..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=8,hidden_size=27392": { - "BLOCK_K": 64, - "SPLIT_K": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index cc645e813758..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=8,hidden_size=2752": { - "BLOCK_K": 32, - "SPLIT_K": 32, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index ae6226d1cb21..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=8,hidden_size=27648": { - "BLOCK_K": 128, - "SPLIT_K": 32, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 88139aaaf02c..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=8,hidden_size=3072": { - "BLOCK_K": 32, - "SPLIT_K": 32, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 3f05658cfb57..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=8,hidden_size=32000": { - "BLOCK_K": 128, - "SPLIT_K": 16, - "num_warps": 8 - } -} diff --git 
a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 4fbf55d8bb05..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=8,hidden_size=32256": { - "BLOCK_K": 128, - "SPLIT_K": 32, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 4064cd359317..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=8,hidden_size=32512": { - "BLOCK_K": 256, - "SPLIT_K": 16, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index ce19b767a77d..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=8,hidden_size=32768": { - "BLOCK_K": 64, - "SPLIT_K": 32, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index d96efbb58943..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=8,hidden_size=33024": { - "BLOCK_K": 64, - "SPLIT_K": 16, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 8f68f4280c29..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=8,hidden_size=3328": { - "BLOCK_K": 32, - "SPLIT_K": 32, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index ec977c3530ae..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=8,hidden_size=3456": { - "BLOCK_K": 32, - "SPLIT_K": 32, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 7aa5634fcd48..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=3584 
device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=8,hidden_size=3584": { - "BLOCK_K": 32, - "SPLIT_K": 32, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index b571892198ce..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=8,hidden_size=36864": { - "BLOCK_K": 32, - "SPLIT_K": 64, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 7bb972163a11..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=8,hidden_size=4096": { - "BLOCK_K": 32, - "SPLIT_K": 32, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index ae17c8ecf5e6..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=8,hidden_size=43264": { - "BLOCK_K": 64, - "SPLIT_K": 64, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index d1f32242519d..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=8,hidden_size=4608": { - "BLOCK_K": 32, - "SPLIT_K": 32, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 2a5dd4740ffb..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=8,hidden_size=49152": { - "BLOCK_K": 256, - "SPLIT_K": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index dbf21f5fa1e0..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=8,hidden_size=512": { - "BLOCK_K": 32, - "SPLIT_K": 16, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=5120 
device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index e920344f2420..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=8,hidden_size=5120": { - "BLOCK_K": 32, - "SPLIT_K": 32, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 79940e1927b2..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=8,hidden_size=5504": { - "BLOCK_K": 32, - "SPLIT_K": 32, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 8a35fdbbafd7..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=8,hidden_size=5632": { - "BLOCK_K": 64, - "SPLIT_K": 32, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index c6e6a52180d3..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=8,hidden_size=6144": { - "BLOCK_K": 64, - "SPLIT_K": 32, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index e9d33ae6f038..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=8,hidden_size=6400": { - "BLOCK_K": 32, - "SPLIT_K": 32, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index be5e4ab7d032..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=8,hidden_size=64000": { - "BLOCK_K": 64, - "SPLIT_K": 64, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 711407b0620f..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=8,hidden_size=64256": { - "BLOCK_K": 128, - "SPLIT_K": 32, - "num_warps": 4 - } -} diff --git 
a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 4f73d54aa992..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=8,hidden_size=6848": { - "BLOCK_K": 128, - "SPLIT_K": 32, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 8c21ae9405a5..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=8,hidden_size=6912": { - "BLOCK_K": 128, - "SPLIT_K": 32, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index d5e3f555a677..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=8,hidden_size=7168": { - "BLOCK_K": 128, - "SPLIT_K": 32, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 68faf2a604da..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=8,hidden_size=8192": { - "BLOCK_K": 128, - "SPLIT_K": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 29f03a383aca..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=8,hidden_size=9216": { - "BLOCK_K": 128, - "SPLIT_K": 32, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 3acf2172ddb3..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=80,hidden_size=1024": { - "BLOCK_K": 32, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 643a627ea0d4..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=10240 
device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=80,hidden_size=10240": { - "BLOCK_K": 32, - "SPLIT_K": 4, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index d99bce723687..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=80,hidden_size=102400": { - "BLOCK_K": 256, - "SPLIT_K": 1, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 877a33b65222..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=80,hidden_size=102656": { - "BLOCK_K": 128, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 12ade6916fcb..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=80,hidden_size=11008": { - "BLOCK_K": 128, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 4dc0e71441f8..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=80,hidden_size=1152": { - "BLOCK_K": 32, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index f21a68e8ee83..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=80,hidden_size=128": { - "BLOCK_K": 128, - "SPLIT_K": 1, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 5152f0b3ff4e..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=80,hidden_size=1280": { - "BLOCK_K": 32, - "SPLIT_K": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json 
b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 3953082729b0..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=80,hidden_size=128000": { - "BLOCK_K": 256, - "SPLIT_K": 1, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 7524f10fca70..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=80,hidden_size=128256": { - "BLOCK_K": 256, - "SPLIT_K": 1, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 2064e9bd9b5b..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=80,hidden_size=13824": { - "BLOCK_K": 64, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index f5348113634c..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=80,hidden_size=14336": { - "BLOCK_K": 256, - "SPLIT_K": 2, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 0dfe95f6c31b..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=80,hidden_size=1536": { - "BLOCK_K": 128, - "SPLIT_K": 8, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 6d25ff48d801..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=80,hidden_size=15360": { - "BLOCK_K": 128, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 6f86b7098d3e..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - 
"batchs=80,hidden_size=2048": { - "BLOCK_K": 64, - "SPLIT_K": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index ac0d8fe29ee7..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=80,hidden_size=22016": { - "BLOCK_K": 64, - "SPLIT_K": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index eebb376f205b..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=80,hidden_size=2304": { - "BLOCK_K": 64, - "SPLIT_K": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 1e45d954518c..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=80,hidden_size=24576": { - "BLOCK_K": 64, - "SPLIT_K": 4, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 702d10096436..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=80,hidden_size=256": { - "BLOCK_K": 32, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index e89e84d4deed..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=80,hidden_size=2560": { - "BLOCK_K": 32, - "SPLIT_K": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 78683762005e..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=80,hidden_size=27392": { - "BLOCK_K": 256, - "SPLIT_K": 2, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 
100644 index 0da685e67d6d..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=80,hidden_size=2752": { - "BLOCK_K": 128, - "SPLIT_K": 4, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 53017d8ee495..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=80,hidden_size=27648": { - "BLOCK_K": 64, - "SPLIT_K": 4, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 7c05ddd4194c..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=80,hidden_size=3072": { - "BLOCK_K": 128, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index ebd7f65eac4a..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=80,hidden_size=32000": { - "BLOCK_K": 64, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 3541f0b9a3f0..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=80,hidden_size=32256": { - "BLOCK_K": 64, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 20d55b196608..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=80,hidden_size=32512": { - "BLOCK_K": 128, - "SPLIT_K": 4, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 96210fec220f..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=80,hidden_size=32768": { - "BLOCK_K": 64, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git 
a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 125b434ef45b..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=80,hidden_size=33024": { - "BLOCK_K": 64, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index b04fbbdda9de..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=80,hidden_size=3328": { - "BLOCK_K": 32, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index ce1b418a958c..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=80,hidden_size=3456": { - "BLOCK_K": 32, - "SPLIT_K": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index ea1c5a006c8f..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=80,hidden_size=3584": { - "BLOCK_K": 64, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 40ec1fab7bba..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=80,hidden_size=36864": { - "BLOCK_K": 64, - "SPLIT_K": 2, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 4c361c17fe59..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=80,hidden_size=4096": { - "BLOCK_K": 128, - "SPLIT_K": 4, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 0dd7c3d37d70..000000000000 --- 
a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=80,hidden_size=43264": { - "BLOCK_K": 128, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 8723dad79e62..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=80,hidden_size=4608": { - "BLOCK_K": 64, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 9415037ed7f7..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=80,hidden_size=49152": { - "BLOCK_K": 64, - "SPLIT_K": 2, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 13eb0b2756ae..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=80,hidden_size=512": { - "BLOCK_K": 32, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index dc583685ab02..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=80,hidden_size=5120": { - "BLOCK_K": 128, - "SPLIT_K": 4, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 23fb68ff8153..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=80,hidden_size=5504": { - "BLOCK_K": 128, - "SPLIT_K": 4, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index b085e5316888..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=80,hidden_size=5632": { - "BLOCK_K": 128, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=6144 
device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 812297caf2a0..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=80,hidden_size=6144": { - "BLOCK_K": 64, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 54d92a85b1f6..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=80,hidden_size=6400": { - "BLOCK_K": 128, - "SPLIT_K": 4, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 405c2e868728..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=80,hidden_size=64000": { - "BLOCK_K": 64, - "SPLIT_K": 2, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index c66a7e5f0ccf..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=80,hidden_size=64256": { - "BLOCK_K": 128, - "SPLIT_K": 2, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 6747ed1a08c7..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=80,hidden_size=6848": { - "BLOCK_K": 64, - "SPLIT_K": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 31651fc1faa2..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=80,hidden_size=6912": { - "BLOCK_K": 128, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index ed2fa7ce6d18..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ 
-1,7 +0,0 @@ -{ - "batchs=80,hidden_size=7168": { - "BLOCK_K": 128, - "SPLIT_K": 4, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index b5546e695dc9..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=80,hidden_size=8192": { - "BLOCK_K": 256, - "SPLIT_K": 2, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 7c44a9ae81b9..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=80,hidden_size=9216": { - "BLOCK_K": 64, - "SPLIT_K": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index f39f6414abd3..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=88,hidden_size=1024": { - "BLOCK_K": 64, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 841902cc2ff0..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=88,hidden_size=10240": { - "BLOCK_K": 64, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index f7aa2b2768cb..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=88,hidden_size=102400": { - "BLOCK_K": 128, - "SPLIT_K": 2, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 07e42e00e844..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=88,hidden_size=102656": { - "BLOCK_K": 256, - "SPLIT_K": 2, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=11008 
device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 830764450db5..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=88,hidden_size=11008": { - "BLOCK_K": 32, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 3a5efc527c8b..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=88,hidden_size=1152": { - "BLOCK_K": 32, - "SPLIT_K": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index becb0e603976..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=88,hidden_size=128": { - "BLOCK_K": 64, - "SPLIT_K": 1, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index b44b8c3d8180..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=88,hidden_size=1280": { - "BLOCK_K": 32, - "SPLIT_K": 16, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index ddfcfb01c7b7..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=88,hidden_size=128000": { - "BLOCK_K": 128, - "SPLIT_K": 1, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 6719e38fba98..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=88,hidden_size=128256": { - "BLOCK_K": 128, - "SPLIT_K": 1, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 0f52b8c9cc0f..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=88,hidden_size=13824": { - "BLOCK_K": 128, - "SPLIT_K": 4, - "num_warps": 4 - 
} -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 9cc14f02017d..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=88,hidden_size=14336": { - "BLOCK_K": 32, - "SPLIT_K": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index c50b3242921a..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=88,hidden_size=1536": { - "BLOCK_K": 64, - "SPLIT_K": 4, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 7e98f105086a..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=88,hidden_size=15360": { - "BLOCK_K": 64, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index aa6bee2870fc..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=88,hidden_size=2048": { - "BLOCK_K": 128, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 21096640cb13..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=88,hidden_size=22016": { - "BLOCK_K": 128, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index a50aa2e0363c..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=88,hidden_size=2304": { - "BLOCK_K": 32, - "SPLIT_K": 4, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index c3364686564e..000000000000 --- 
a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=88,hidden_size=24576": { - "BLOCK_K": 64, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index b7ff93819113..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=88,hidden_size=256": { - "BLOCK_K": 32, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 8d10bf69031c..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=88,hidden_size=2560": { - "BLOCK_K": 128, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 096b8d320b72..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=88,hidden_size=27392": { - "BLOCK_K": 256, - "SPLIT_K": 2, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 684d38d2811c..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=88,hidden_size=2752": { - "BLOCK_K": 64, - "SPLIT_K": 2, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 2bd01194d5ce..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=88,hidden_size=27648": { - "BLOCK_K": 64, - "SPLIT_K": 4, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index a7d00956f02c..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=88,hidden_size=3072": { - "BLOCK_K": 128, - "SPLIT_K": 4, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=32000 
device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 886d8ea5f4ab..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=88,hidden_size=32000": { - "BLOCK_K": 128, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index d8e6b13dbe92..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=88,hidden_size=32256": { - "BLOCK_K": 64, - "SPLIT_K": 4, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index c62e742fa961..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=88,hidden_size=32512": { - "BLOCK_K": 64, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index e1d739d0a49d..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=88,hidden_size=32768": { - "BLOCK_K": 64, - "SPLIT_K": 4, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 601d128de45a..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=88,hidden_size=33024": { - "BLOCK_K": 64, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 5dd67e2690f9..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=88,hidden_size=3328": { - "BLOCK_K": 64, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index b7d9fe07cfb5..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json +++ 
/dev/null @@ -1,7 +0,0 @@ -{ - "batchs=88,hidden_size=3456": { - "BLOCK_K": 64, - "SPLIT_K": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index ca7f97699ec8..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=88,hidden_size=3584": { - "BLOCK_K": 64, - "SPLIT_K": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 005f4af2dd66..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=88,hidden_size=36864": { - "BLOCK_K": 128, - "SPLIT_K": 2, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index b97db9b80ccc..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=88,hidden_size=4096": { - "BLOCK_K": 64, - "SPLIT_K": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index fab5f7de4715..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=88,hidden_size=43264": { - "BLOCK_K": 256, - "SPLIT_K": 2, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 78ca804fa3d1..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=88,hidden_size=4608": { - "BLOCK_K": 64, - "SPLIT_K": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 524c1118598f..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=88,hidden_size=49152": { - "BLOCK_K": 128, - "SPLIT_K": 2, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=512 
device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 387b87a065a8..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=88,hidden_size=512": { - "BLOCK_K": 32, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 34ce46ce03d2..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=88,hidden_size=5120": { - "BLOCK_K": 256, - "SPLIT_K": 2, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 148080894721..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=88,hidden_size=5504": { - "BLOCK_K": 128, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index c75811679466..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=88,hidden_size=5632": { - "BLOCK_K": 64, - "SPLIT_K": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index fdc0c3cfa0dd..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=88,hidden_size=6144": { - "BLOCK_K": 128, - "SPLIT_K": 4, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index c86867594102..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=88,hidden_size=6400": { - "BLOCK_K": 128, - "SPLIT_K": 2, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 8e4ca4b8d8a1..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=88,hidden_size=64000": { - "BLOCK_K": 256, - "SPLIT_K": 2, - "num_warps": 4 - } -} diff --git 
a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 206e6e2d37e4..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=88,hidden_size=64256": { - "BLOCK_K": 128, - "SPLIT_K": 2, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index ab33b78848ec..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=88,hidden_size=6848": { - "BLOCK_K": 128, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index e06af8cd8cfd..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=88,hidden_size=6912": { - "BLOCK_K": 128, - "SPLIT_K": 4, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index b775ea143b36..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=88,hidden_size=7168": { - "BLOCK_K": 64, - "SPLIT_K": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 9300b4bdb8f1..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=88,hidden_size=8192": { - "BLOCK_K": 64, - "SPLIT_K": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 3c604544d052..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=88,hidden_size=9216": { - "BLOCK_K": 32, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 068b851caa7f..000000000000 --- 
a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=96,hidden_size=1024": { - "BLOCK_K": 32, - "SPLIT_K": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 378704ab28cf..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=96,hidden_size=10240": { - "BLOCK_K": 64, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 90174392ce3e..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=96,hidden_size=102400": { - "BLOCK_K": 256, - "SPLIT_K": 2, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 112850a0b030..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=96,hidden_size=102656": { - "BLOCK_K": 128, - "SPLIT_K": 1, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 77f512f41fb9..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=96,hidden_size=11008": { - "BLOCK_K": 128, - "SPLIT_K": 4, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 1f3f1b604a4d..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=96,hidden_size=1152": { - "BLOCK_K": 64, - "SPLIT_K": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 3124fa86ce0a..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=96,hidden_size=128": { - "BLOCK_K": 128, - "SPLIT_K": 1, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=1280 
device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index fea9bf5bd3a5..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=96,hidden_size=1280": { - "BLOCK_K": 64, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index df330fd200a9..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=96,hidden_size=128000": { - "BLOCK_K": 128, - "SPLIT_K": 2, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 3753f062cd04..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=96,hidden_size=128256": { - "BLOCK_K": 128, - "SPLIT_K": 2, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 5aee58b4062e..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=96,hidden_size=13824": { - "BLOCK_K": 64, - "SPLIT_K": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 2db120babc1f..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=96,hidden_size=14336": { - "BLOCK_K": 64, - "SPLIT_K": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 11a26c11166c..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=96,hidden_size=1536": { - "BLOCK_K": 64, - "SPLIT_K": 32, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 100067e93785..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json 
+++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=96,hidden_size=15360": { - "BLOCK_K": 128, - "SPLIT_K": 4, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 5d9db82e5288..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=96,hidden_size=2048": { - "BLOCK_K": 32, - "SPLIT_K": 4, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index abc9c95e8b22..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=96,hidden_size=22016": { - "BLOCK_K": 128, - "SPLIT_K": 4, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 66be2017f0ae..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=96,hidden_size=2304": { - "BLOCK_K": 32, - "SPLIT_K": 4, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 45873b198e73..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=96,hidden_size=24576": { - "BLOCK_K": 64, - "SPLIT_K": 2, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index ff32d7268f4e..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=96,hidden_size=256": { - "BLOCK_K": 32, - "SPLIT_K": 2, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index c404185f47ef..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=96,hidden_size=2560": { - "BLOCK_K": 32, - "SPLIT_K": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=27392 
device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 45dc37cd6c1e..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=96,hidden_size=27392": { - "BLOCK_K": 128, - "SPLIT_K": 4, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 354f9cda513e..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=96,hidden_size=2752": { - "BLOCK_K": 64, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 3ce20813a940..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=96,hidden_size=27648": { - "BLOCK_K": 64, - "SPLIT_K": 2, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index fcec90b796f8..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=96,hidden_size=3072": { - "BLOCK_K": 128, - "SPLIT_K": 4, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 06035f8733b3..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=96,hidden_size=32000": { - "BLOCK_K": 64, - "SPLIT_K": 2, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index c5c04329190b..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=96,hidden_size=32256": { - "BLOCK_K": 64, - "SPLIT_K": 4, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 89bdb176ebc2..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=96,hidden_size=32512": { - "BLOCK_K": 64, - "SPLIT_K": 4, - "num_warps": 4 - } 
-} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index faab7a47840a..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=96,hidden_size=32768": { - "BLOCK_K": 128, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 5b56d69c403f..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=96,hidden_size=33024": { - "BLOCK_K": 256, - "SPLIT_K": 2, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 4b0e1c5badfa..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=96,hidden_size=3328": { - "BLOCK_K": 64, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 600943e1897d..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=96,hidden_size=3456": { - "BLOCK_K": 32, - "SPLIT_K": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 6690b75db842..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=96,hidden_size=3584": { - "BLOCK_K": 64, - "SPLIT_K": 4, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index e584bc28dd7d..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=96,hidden_size=36864": { - "BLOCK_K": 256, - "SPLIT_K": 2, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 7b09b5d1a65c..000000000000 --- 
a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=96,hidden_size=4096": { - "BLOCK_K": 32, - "SPLIT_K": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 98d8cebcbf73..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=96,hidden_size=43264": { - "BLOCK_K": 64, - "SPLIT_K": 2, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index c63bbc7f882d..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=96,hidden_size=4608": { - "BLOCK_K": 128, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index a36a9b36aa45..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=96,hidden_size=49152": { - "BLOCK_K": 64, - "SPLIT_K": 2, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index d24898f0d4c4..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=96,hidden_size=512": { - "BLOCK_K": 32, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 56f8409d1aee..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=96,hidden_size=5120": { - "BLOCK_K": 32, - "SPLIT_K": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 589965340a56..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=96,hidden_size=5504": { - "BLOCK_K": 64, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=5632 
device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index cf5e15814824..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=96,hidden_size=5632": { - "BLOCK_K": 128, - "SPLIT_K": 4, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 2034cf5ea634..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=96,hidden_size=6144": { - "BLOCK_K": 128, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index a2fb7b122395..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=96,hidden_size=6400": { - "BLOCK_K": 128, - "SPLIT_K": 2, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 3128d88fe9bc..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=96,hidden_size=64000": { - "BLOCK_K": 128, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 751604b796cd..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=96,hidden_size=64256": { - "BLOCK_K": 128, - "SPLIT_K": 2, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index c5d3fc706dc8..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=96,hidden_size=6848": { - "BLOCK_K": 128, - "SPLIT_K": 4, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 26748fd2b2b1..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null 
@@ -1,7 +0,0 @@ -{ - "batchs=96,hidden_size=6912": { - "BLOCK_K": 64, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 8b0abb8d3cb2..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=96,hidden_size=7168": { - "BLOCK_K": 128, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 46756b2b589a..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=96,hidden_size=8192": { - "BLOCK_K": 64, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 6dd33a999d46..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=96,hidden_size=9216": { - "BLOCK_K": 64, - "SPLIT_K": 4, - "num_warps": 4 - } -} From b345434581e042a296168a0efb4cca66631af1b9 Mon Sep 17 00:00:00 2001 From: jeejeeli Date: Fri, 14 Jun 2024 14:18:19 +0800 Subject: [PATCH 22/71] add default config --- tests/lora/test_triton_punica.py | 3 +++ vllm/lora/ops/bgmv_expand.py | 5 +++-- vllm/lora/ops/bgmv_expand_slice.py | 4 +++- vllm/lora/ops/bgmv_shrink.py | 4 +++- vllm/lora/ops/utils.py | 26 ++++++++++---------------- 5 files changed, 22 insertions(+), 20 deletions(-) diff --git a/tests/lora/test_triton_punica.py b/tests/lora/test_triton_punica.py index a098aba16456..29df528cdf05 100644 --- a/tests/lora/test_triton_punica.py +++ b/tests/lora/test_triton_punica.py @@ -325,6 +325,7 @@ def _generate_data_expand_nslices(batchs, hidden_size, lora_nums, max_rank, # assert_close(our_out_tensor, ref_out_tensor) +@pytest.mark.skip("stop") @pytest.mark.parametrize("hidden_size", HIDDEN_SIZES) @pytest.mark.parametrize("scaling", SCALES) @pytest.mark.parametrize("dtype", DTYPES) @@ -469,6 +470,7 @@ def test_triton_bgmv_punica_bgmv( assert_close(our_out_tensor, ref_out_tensor) +@pytest.mark.skip("stop") @pytest.mark.parametrize("batchs", BATCHS) @pytest.mark.parametrize("hidden_size", HIDDEN_SIZES) @pytest.mark.parametrize("nslices", NSLICES) @@ -547,6 +549,7 @@ def test_sgmv_expand_slice( assert_close(our_outputs, ref_outputs) +@pytest.mark.skip("stop") @pytest.mark.parametrize("batchs", BATCHS) @pytest.mark.parametrize("hidden_size", HIDDEN_SIZES) @pytest.mark.parametrize("nslices", NSLICES) diff --git a/vllm/lora/ops/bgmv_expand.py b/vllm/lora/ops/bgmv_expand.py index 04fdd670243d..b977540cbfb4 100644 --- a/vllm/lora/ops/bgmv_expand.py +++ b/vllm/lora/ops/bgmv_expand.py @@ -5,10 +5,12 @@ https://arxiv.org/abs/2310.18547 """ +from typing import Dict, Optional + import torch import triton import triton.language as tl -from typing import Dict, Optional + from 
.utils import get_lora_op_configs @@ -137,7 +139,6 @@ def bgmv_expand( torch.bfloat16, ]: CAST_TYPE = True - config = {"BLOCK_N": 64, "SPLIT_N": 8} batchs = lora_indices_tensor.size(0) if override_config: diff --git a/vllm/lora/ops/bgmv_expand_slice.py b/vllm/lora/ops/bgmv_expand_slice.py index becaf4f1ca07..c741d10e9c9d 100644 --- a/vllm/lora/ops/bgmv_expand_slice.py +++ b/vllm/lora/ops/bgmv_expand_slice.py @@ -5,10 +5,12 @@ https://arxiv.org/abs/2310.18547 """ +from typing import Dict, Optional + import torch import triton import triton.language as tl -from typing import Any, Dict, Optional + from .utils import get_lora_op_configs diff --git a/vllm/lora/ops/bgmv_shrink.py b/vllm/lora/ops/bgmv_shrink.py index 99b9d7ee5b9f..a7087a96488f 100644 --- a/vllm/lora/ops/bgmv_shrink.py +++ b/vllm/lora/ops/bgmv_shrink.py @@ -5,10 +5,12 @@ https://arxiv.org/abs/2310.18547 """ +from typing import Dict, Optional + import torch import triton import triton.language as tl -from typing import Dict, Optional + from .utils import get_lora_op_configs diff --git a/vllm/lora/ops/utils.py b/vllm/lora/ops/utils.py index f4e71cb110bb..6124916cfd9d 100644 --- a/vllm/lora/ops/utils.py +++ b/vllm/lora/ops/utils.py @@ -1,7 +1,7 @@ import functools import json import os -from typing import Dict, Optional +from typing import Dict def _get_config_file_name( @@ -11,16 +11,12 @@ def _get_config_file_name( ) -> str: # device_name = torch.cuda.get_device_name().replace(" ", "_") device_name = "NVIDIA_GeForce_RTX_3090" - return ( - f"op_type={op_type},batchs={batchs},hidden_size={hidden_size} " - + f"device_name={device_name}.json" - ) + return (f"op_type={op_type},batchs={batchs},hidden_size={hidden_size} " + + f"device_name={device_name}.json") @functools.lru_cache -def _get_op_configs( - op_type: str, batch: int, hidden_size: int -) -> Optional[Dict[str, int]]: +def _get_op_configs(op_type: str, batch: int, hidden_size: int): FOLDER_NAME = "bgmv_configs" json_file_name = _get_config_file_name(op_type, batch, hidden_size) @@ -32,24 +28,22 @@ def _get_op_configs( if os.path.exists(config_file_path): with open(config_file_path) as f: tuned_config = json.load(f).get( - f"batchs={batch},hidden_size={hidden_size}", None - ) + f"batchs={batch},hidden_size={hidden_size}", None) return tuned_config - + # If no optimized configuration is available, return None return None def _get_default_config(op_type: str, batch: int, hidden_size: int): if op_type == "expand": - return {"BLOCK_N": 256, "SPLIT_N": 8, "num_warps": 8} + return {"BLOCK_N": 256, "SPLIT_N": 64, "num_warps": 8} else: - return {"BLOCK_K": 32, "SPLIT_K": 64, "num_warps": 8} + return {"BLOCK_K": 256, "SPLIT_K": 64, "num_warps": 8} -def get_lora_op_configs( - op_type: str, batch: int, hidden_size: int -) -> Dict[str, int]: +def get_lora_op_configs(op_type: str, batch: int, + hidden_size: int) -> Dict[str, int]: config = _get_op_configs(op_type, batch, hidden_size) if not config: config = _get_default_config(op_type, batch, hidden_size) From 00e007695c8cfa466f53fa74a0a601aa42a10cd7 Mon Sep 17 00:00:00 2001 From: jeejeeli Date: Fri, 14 Jun 2024 14:20:54 +0800 Subject: [PATCH 23/71] add default config --- tests/lora/test_triton_punica.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/tests/lora/test_triton_punica.py b/tests/lora/test_triton_punica.py index 29df528cdf05..1a5fd9e3f4d7 100644 --- a/tests/lora/test_triton_punica.py +++ b/tests/lora/test_triton_punica.py @@ -325,7 +325,6 @@ def _generate_data_expand_nslices(batchs, hidden_size, lora_nums, max_rank, # 
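For orientation, this is how the tuned-config lookup in vllm/lora/ops/utils.py is meant to be consumed; a minimal sketch, where the default values come from _get_default_config in the hunk above and the kernel-launch call site is an assumption, not code from this patch:

    from vllm.lora.ops.utils import get_lora_op_configs

    # A per-shape JSON file (like the ones removed above) takes precedence when
    # present; otherwise the hard-coded defaults are returned.
    shrink_cfg = get_lora_op_configs("shrink", batch=96, hidden_size=4096)
    # no matching JSON -> {"BLOCK_K": 256, "SPLIT_K": 64, "num_warps": 8}
    expand_cfg = get_lora_op_configs("expand", batch=96, hidden_size=4096)
    # no matching JSON -> {"BLOCK_N": 256, "SPLIT_N": 64, "num_warps": 8}

    # The returned dict is then splatted into the Triton launch, e.g.
    # _bgmv_shrink_kernel[grid](..., **shrink_cfg)   (call site assumed)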
assert_close(our_out_tensor, ref_out_tensor) -@pytest.mark.skip("stop") @pytest.mark.parametrize("hidden_size", HIDDEN_SIZES) @pytest.mark.parametrize("scaling", SCALES) @pytest.mark.parametrize("dtype", DTYPES) @@ -469,7 +468,6 @@ def test_triton_bgmv_punica_bgmv( ref_out_tensor = ref_out_tensor.to(torch.float32) assert_close(our_out_tensor, ref_out_tensor) - @pytest.mark.skip("stop") @pytest.mark.parametrize("batchs", BATCHS) @pytest.mark.parametrize("hidden_size", HIDDEN_SIZES) @@ -549,7 +547,6 @@ def test_sgmv_expand_slice( assert_close(our_outputs, ref_outputs) -@pytest.mark.skip("stop") @pytest.mark.parametrize("batchs", BATCHS) @pytest.mark.parametrize("hidden_size", HIDDEN_SIZES) @pytest.mark.parametrize("nslices", NSLICES) From f4bd5804a0cc89c773d62de98ca05d8e0f3a7707 Mon Sep 17 00:00:00 2001 From: jeejeeli Date: Fri, 14 Jun 2024 15:20:16 +0800 Subject: [PATCH 24/71] test conflict --- vllm/worker/model_runner.py | 325 ++++++++++++++++++++---------------- 1 file changed, 177 insertions(+), 148 deletions(-) diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index a3e52a749fb6..476e9ba3bb46 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -1,5 +1,7 @@ +import gc import time import warnings +from collections import defaultdict from typing import Dict, List, NamedTuple, Optional, Set, Tuple, Union import numpy as np @@ -11,16 +13,17 @@ ModelConfig, ParallelConfig, SchedulerConfig, VisionLanguageConfig) from vllm.distributed import broadcast_tensor_dict -from vllm.distributed.communication_op import graph_capture +from vllm.distributed.parallel_state import graph_capture from vllm.logger import init_logger from vllm.lora.layers import LoRAMapping from vllm.lora.request import LoRARequest from vllm.lora.worker_manager import LRUCacheWorkerLoRAManager from vllm.model_executor import SamplingMetadata from vllm.model_executor.model_loader import get_model +from vllm.model_executor.model_loader.tensorizer import TensorizerConfig +from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.sampling_params import SamplingParams -from vllm.sequence import (MultiModalData, SamplerOutput, SequenceData, - SequenceGroupMetadata) +from vllm.sequence import SamplerOutput, SequenceData, SequenceGroupMetadata from vllm.utils import (CudaMemoryProfiler, get_kv_cache_torch_dtype, is_hip, is_pin_memory_available, make_tensor_with_pad) @@ -34,6 +37,7 @@ _BATCH_SIZES_TO_CAPTURE = [1, 2, 4] + [ _BATCH_SIZE_ALIGNMENT * i for i in range(1, 33) ] +_NUM_WARMUP_ITERS = 2 class ModelInput(NamedTuple): @@ -44,7 +48,7 @@ class ModelInput(NamedTuple): query_lens: List[int] lora_mapping: Optional[LoRAMapping] lora_requests: Set[LoRARequest] - multi_modal_input: Optional[torch.Tensor] + multi_modal_kwargs: Dict[str, torch.Tensor] slot_mapping: torch.Tensor num_prefill_tokens: int num_decode_tokens: int @@ -60,7 +64,7 @@ def empty(cls, device): query_lens=[], lora_mapping=None, lora_requests=set(), - multi_modal_input=None, + multi_modal_kwargs={}, slot_mapping=torch.empty(0, device=device), num_prefill_tokens=0, num_decode_tokens=0, @@ -122,6 +126,16 @@ def __init__( self.block_size, ) + # Create processor for multi-modal data + if self.vision_language_config is not None: + self.multi_modal_input_processor = MULTIMODAL_REGISTRY \ + .create_input_processor( + self.model_config, + self.vision_language_config, + ) + else: + self.multi_modal_input_processor = None + # Lazy initialization self.model: nn.Module # Set after load_model # Set if the backend is flashinfer. 
@@ -209,6 +223,16 @@ def save_sharded_state( max_size=max_size, ) + def save_tensorized_model( + self, + tensorizer_config: TensorizerConfig, + ) -> None: + from vllm.model_executor.model_loader.loader import TensorizerLoader + TensorizerLoader.save_model( + self.model, + tensorizer_config=tensorizer_config, + ) + def get_max_block_per_batch(self) -> int: block_size = self.block_size return (self.max_seq_len_to_capture + block_size - 1) // block_size @@ -233,7 +257,6 @@ def _prepare_model_input( input_positions: List[int] = [] slot_mapping: List[int] = [] lora_index_mapping: List[int] = [] - batch_lora_index_mapping: List[int] = [] lora_prompt_mapping: List[int] = [] lora_requests: Set[LoRARequest] = set() @@ -243,7 +266,8 @@ def _prepare_model_input( context_lens: List[int] = [] query_lens: List[int] = [] block_tables: List[List[int]] = [] - multi_modal_input_list: List[torch.Tensor] = [] + multi_modal_kwargs_list: Dict[str, + List[torch.Tensor]] = defaultdict(list) decode_only = True num_prefills = 0 num_prefill_tokens = 0 @@ -270,6 +294,12 @@ def _prepare_model_input( if len(seq_group_metadata_list) == 0: return ModelInput.empty(self.device) + if self.sliding_window is not None: + sliding_window_blocks = (self.sliding_window + self.block_size - + 1) // self.block_size + block_aligned_sliding_window = \ + sliding_window_blocks * self.block_size + for seq_group_metadata in seq_group_metadata_list: seq_ids = list(seq_group_metadata.seq_data.keys()) is_prompt = seq_group_metadata.is_prompt @@ -310,6 +340,30 @@ def _prepare_model_input( and self.sliding_window is None and is_prompt) + # These are seq_len/context_len capped to the sliding window. + # They are passed to decode kernel. + # We still need original seq_len/context_len to compute slot + # mapping (and input position) below. + curr_sliding_window_blocks = None + sliding_seq_len = seq_len + sliding_context_len = context_len + + # TODO(sang): This is a hack to make sliding window work with + # paged attn. We can remove it if we make paged attn kernel + # to properly handle slinding window attn. + if (self.sliding_window is not None and not is_prompt): + curr_sliding_window_blocks = sliding_window_blocks + if self.scheduler_config.use_v2_block_manager: + # number of elements in last block + suff_len = seq_len % self.block_size + sliding_seq_len = min( + seq_len, block_aligned_sliding_window + suff_len) + if suff_len > 0: + curr_sliding_window_blocks += 1 + else: + sliding_seq_len = min(seq_len, self.sliding_window) + sliding_context_len = sliding_seq_len - 1 + # TODO(sang): Combine chunked prefill and prefix caching by # only allowing multiple of block_size chunk size. # NOTE: This only works for oooooooxxx style attention. @@ -317,6 +371,13 @@ def _prepare_model_input( assert computed_block_nums is not None context_len = len(computed_block_nums) * self.block_size tokens = tokens[context_len:] + + # need to think what to set it to when we have both sliding + # window and prefix caching... + assert self.sliding_window is None, \ + "Prefix caching is not supported with sliding window" + sliding_context_len = context_len + if self.attn_backend.get_name() == "flash-attn": # NOTE(woosuk): For flash-attn, the block table should # include the entries for the incoming prefill tokens. 
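A small worked example of the block-aligned sliding-window arithmetic introduced in the hunk above; the concrete numbers are chosen only for illustration:

    # Assume block_size=16, sliding_window=100, and a decode step at seq_len=215.
    block_size, sliding_window, seq_len = 16, 100, 215

    sliding_window_blocks = (sliding_window + block_size - 1) // block_size   # 7
    block_aligned_sliding_window = sliding_window_blocks * block_size         # 112

    # The v2 block manager keeps the partially filled last block in full:
    suff_len = seq_len % block_size                                           # 7
    sliding_seq_len = min(seq_len, block_aligned_sliding_window + suff_len)   # 119
    curr_sliding_window_blocks = sliding_window_blocks + (1 if suff_len else 0)  # 8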
@@ -330,14 +391,9 @@ def _prepare_model_input( if seq_group_metadata.block_tables is not None: # chunked prefill or decode block_table = seq_group_metadata.block_tables[seq_id] - if self.sliding_window is not None: - # chunked prefill doesn't support sliding window. - assert (not self.scheduler_config. - chunked_prefill_enabled) - sliding_window_blocks = (self.sliding_window // - self.block_size) - block_table = block_table[-sliding_window_blocks:] - + if curr_sliding_window_blocks is not None: + block_table = block_table[ + -curr_sliding_window_blocks:] if self.attn_backend.get_name() == "flashinfer": paged_kv_indices.extend(block_table) paged_kv_indptr.append(paged_kv_indptr[-1] + @@ -355,16 +411,9 @@ def _prepare_model_input( block_table = [] block_tables.append(block_table) - # TODO(sang): This is a hack to make sliding window work with - # paged attn. We can remove it if we make paged attn kernel - # to properly handle slinding window attn. - if (self.sliding_window is not None and not is_prompt): - seq_len = min(seq_len, self.sliding_window) - context_len = seq_len - 1 - - seq_lens.append(seq_len) - context_lens.append(context_len) - query_len = seq_len - context_len + seq_lens.append(sliding_seq_len) + context_lens.append(sliding_context_len) + query_len = sliding_seq_len - sliding_context_len query_lens.append(query_len) input_tokens.extend(tokens) input_positions.extend(list(range(context_len, seq_len))) @@ -381,23 +430,29 @@ def _prepare_model_input( "seq_len: {}, context_len: {}, query_len: {}".format( seq_len, context_len, query_len)) num_decode_tokens += query_len - decode_seq_lens.append(seq_len) + decode_seq_lens.append(sliding_seq_len) if lora_id > 0: lora_requests.add(seq_group_metadata.lora_request) lora_index_mapping += [lora_id] * query_len - batch_lora_index_mapping += [lora_id if lora_id > 0 else -1] lora_prompt_mapping.extend( [lora_id] * - (seq_len - - context_len if seq_group_metadata.sampling_params + (query_len if seq_group_metadata.sampling_params and seq_group_metadata.sampling_params.prompt_logprobs - else 1)) + is not None else 1)) + + mm_data = seq_group_metadata.multi_modal_data + if mm_data is not None: + # Process multi-modal data + if self.multi_modal_input_processor is None: + raise ValueError( + "Multi-modal inputs are only supported by " + "vision language models.") - if seq_group_metadata.multi_modal_data: - multi_modal_input_list.append( - seq_group_metadata.multi_modal_data.data) + mm_kwargs = self.multi_modal_input_processor(mm_data) + for k, v in mm_kwargs.items(): + multi_modal_kwargs_list[k].append(v) if _is_block_tables_empty(seq_group_metadata.block_tables): # During memory profiling, the block tables are not @@ -419,9 +474,10 @@ def _prepare_model_input( start_idx = 0 if self.sliding_window is not None: if is_prompt: - assert context_len == 0, ( + assert self.scheduler_config.use_v2_block_manager \ + or context_len == 0, ( "Prefix caching is currently not supported with " - "sliding window attention") + "sliding window attention in V1 block manager") # It is an optimization. When it is decoding, it is always # 0. When prefill, we use it to not write slots to kv cache # to save memory. 
@@ -482,29 +538,6 @@ def _prepare_model_input( ) assert max_query_len > 0, ("query_lens: {}".format(query_lens)) - context_lens_tensor = torch.tensor(context_lens, - dtype=torch.int, - device=self.device) - - if multi_modal_input_list: - assert self.vision_language_config, ( - "Multi-modal inputs are only supported by " - "vision language models.") - multi_modal_input = torch.cat(multi_modal_input_list, - dim=0).to(self.device) - else: - multi_modal_input = None - - seq_lens_tensor = torch.tensor(seq_lens, - dtype=torch.int, - device=self.device) - query_lens_tensor = torch.tensor(query_lens, - dtype=torch.long, - device=self.device) - query_start_loc = torch.zeros(query_lens_tensor.shape[0] + 1, - dtype=torch.int32, - device=self.device) - seq_lens_tensor = torch.tensor(seq_lens, dtype=torch.int, device=self.device) @@ -512,11 +545,6 @@ def _prepare_model_input( dtype=torch.int32, device=self.device) - torch.cumsum(query_lens_tensor, - dim=0, - dtype=query_start_loc.dtype, - out=query_start_loc[1:]) - torch.cumsum(seq_lens_tensor, dim=0, dtype=seq_start_loc.dtype, @@ -569,6 +597,21 @@ def _prepare_model_input( seq_start_loc=seq_start_loc, data_type=kv_cache_dtype) else: + context_lens_tensor = torch.tensor(context_lens, + dtype=torch.int, + device=self.device) + query_lens_tensor = torch.tensor(query_lens, + dtype=torch.long, + device=self.device) + query_start_loc = torch.zeros(query_lens_tensor.shape[0] + 1, + dtype=torch.int32, + device=self.device) + + torch.cumsum(query_lens_tensor, + dim=0, + dtype=query_start_loc.dtype, + out=query_start_loc[1:]) + attn_metadata = self.attn_backend.make_metadata( num_prefills=num_prefills, slot_mapping=slot_mapping_tensor, @@ -587,12 +630,18 @@ def _prepare_model_input( ) if self.lora_config: - lora_mapping = LoRAMapping(lora_index_mapping, lora_prompt_mapping, - batch_lora_index_mapping, query_lens, - bool(attn_metadata.prefill_metadata)) + lora_mapping = LoRAMapping( + lora_index_mapping, + lora_prompt_mapping, + ) else: lora_mapping = None + multi_modal_kwargs = { + k: torch.cat(v, dim=0).to(self.device) + for k, v in multi_modal_kwargs_list.items() + } + return ModelInput( input_tokens=input_tokens_tensor, input_positions=input_positions_tensor, @@ -601,7 +650,7 @@ def _prepare_model_input( query_lens=query_lens, lora_mapping=lora_mapping, lora_requests=lora_requests, - multi_modal_input=multi_modal_input, + multi_modal_kwargs=multi_modal_kwargs, slot_mapping=slot_mapping_tensor, num_prefill_tokens=num_prefill_tokens, num_decode_tokens=num_decode_tokens, @@ -612,7 +661,7 @@ def prepare_input_tensors( self, seq_group_metadata_list: Optional[List[SequenceGroupMetadata]], ) -> Tuple[torch.Tensor, torch.Tensor, AttentionMetadata, SamplingMetadata, - Set[LoRARequest], LoRAMapping, torch.Tensor]: + Set[LoRARequest], LoRAMapping, Dict[str, torch.Tensor]]: if self.is_driver_worker: assert seq_group_metadata_list is not None # Prepare input tensors. 
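As a reading aid for the hunk above: LoRAMapping now carries only the per-token and per-prompt LoRA ids, and multi-modal inputs travel as a plain kwargs dict. A small sketch with invented sequence lengths and LoRA ids:

    from vllm.lora.layers import LoRAMapping

    # Two prefill sequences of lengths 3 and 2, served by LoRA ids 1 and 2.
    # index_mapping has one entry per token; prompt_mapping has one entry per
    # sequence (or one per prompt token when prompt_logprobs is requested).
    lora_mapping = LoRAMapping(
        index_mapping=(1, 1, 1, 2, 2),
        prompt_mapping=(1, 2),
    )
    # The batch_mapping/seq_lens fields of the earlier revision are dropped;
    # the prefill-vs-decode flag is instead read out of indices_len inside the
    # LoRA layer code (see the layers.py changes later in this series).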
@@ -624,7 +673,7 @@ def prepare_input_tensors( query_lens, lora_mapping, lora_requests, - multi_modal_input, + multi_modal_kwargs, slot_mapping, num_prefill_tokens, num_decode_tokens, @@ -641,7 +690,7 @@ def prepare_input_tensors( sampling_metadata.selected_token_indices, "lora_requests": lora_requests, "lora_mapping": lora_mapping, - "multi_modal_input": multi_modal_input, + "multi_modal_kwargs": multi_modal_kwargs, "num_prefill_tokens": num_prefill_tokens, "num_decode_tokens": num_decode_tokens, "slot_mapping": slot_mapping, @@ -658,7 +707,7 @@ def prepare_input_tensors( "selected_token_indices") lora_mapping = metadata_dict.pop("lora_mapping") lora_requests = metadata_dict.pop("lora_requests") - multi_modal_input = metadata_dict.pop("multi_modal_input") + multi_modal_kwargs = metadata_dict.pop("multi_modal_kwargs") if metadata_dict: attn_metadata = self.attn_backend.make_metadata( **metadata_dict) @@ -673,7 +722,7 @@ def prepare_input_tensors( return (input_tokens, input_positions, attn_metadata, sampling_metadata, lora_requests, lora_mapping, - multi_modal_input) + multi_modal_kwargs) @torch.inference_mode() def execute_model( @@ -682,7 +731,7 @@ def execute_model( kv_caches: List[torch.Tensor], ) -> Optional[SamplerOutput]: (input_tokens, input_positions, attn_metadata, sampling_metadata, - lora_requests, lora_mapping, multi_modal_input + lora_requests, lora_mapping, multi_modal_kwargs ) = self.prepare_input_tensors(seq_group_metadata_list) if self.lora_config: @@ -696,15 +745,14 @@ def execute_model( model_executable = self.graph_runners[graph_batch_size] else: model_executable = self.model - execute_model_kwargs = { - "input_ids": input_tokens, - "positions": input_positions, - "kv_caches": kv_caches, - "attn_metadata": attn_metadata, - } - if self.vision_language_config: - execute_model_kwargs.update({"image_input": multi_modal_input}) - hidden_states = model_executable(**execute_model_kwargs) + + hidden_states = model_executable( + input_ids=input_tokens, + positions=input_positions, + kv_caches=kv_caches, + attn_metadata=attn_metadata, + **multi_modal_kwargs, + ) # Compute the logits. logits = self.model.compute_logits(hidden_states, sampling_metadata) @@ -760,16 +808,24 @@ def profile_run(self) -> None: # To exercise the worst scenario for GPU memory consumption, # the number of seqs (batch_size) is chosen to maximize the number # of images processed. 
- if self.vision_language_config: + model_config = self.model_config + vlm_config = self.vision_language_config + + if vlm_config: max_num_seqs = min( max_num_seqs, - int(max_num_batched_tokens / - self.vision_language_config.image_feature_size)) + int(max_num_batched_tokens / vlm_config.image_feature_size)) for group_id in range(max_num_seqs): seq_len = (max_num_batched_tokens // max_num_seqs + (group_id < max_num_batched_tokens % max_num_seqs)) - seq_data, fake_multi_modal_input = _prepare_fake_inputs( - seq_len, self.vision_language_config) + + if vlm_config is None: + seq_data = SequenceData([0] * seq_len) + dummy_multi_modal_data = None + else: + seq_data, dummy_multi_modal_data = MULTIMODAL_REGISTRY \ + .dummy_data_for_profiling(seq_len, model_config, vlm_config) + seq = SequenceGroupMetadata( request_id=str(group_id), is_prompt=True, @@ -778,7 +834,7 @@ def profile_run(self) -> None: block_tables=None, lora_request=dummy_lora_requests_per_seq[group_id] if dummy_lora_requests_per_seq else None, - multi_modal_data=fake_multi_modal_input, + multi_modal_data=dummy_multi_modal_data, ) seqs.append(seq) @@ -789,32 +845,6 @@ def profile_run(self) -> None: torch.cuda.synchronize() return - # def compose_lora_kernel_meta( - # self, - # attn_metadata: AttentionMetadata, - # ) -> LoRAKernelMeta: - # if attn_metadata.prefill_metadata: - # max_seq_len = attn_metadata.max_query_len - # seq_start_loc = attn_metadata.query_start_loc - # seq_lens_tensor = attn_metadata.seq_lens_tensor - # batch_size = attn_metadata.num_prefills - # else: - # max_seq_len = attn_metadata.max_query_len - # seq_start_loc = attn_metadata.query_start_loc - # batch_size = attn_metadata.decode_metadata.num_decode_tokens - # seq_lens_tensor = torch.ones((batch_size), - # dtype=torch.long, - # device=self.device) - - # if batch_size == 0: - # print("sssss") - # # lora_index_lst = lora_mapping.batch_mapping - # # lora_index_tensor = torch.tensor(lora_index_lst, - # # dtype=torch.long, - # # device=self.device) - # return LoRAKernelMeta(batch_size, max_seq_len, seq_lens_tensor, - # seq_start_loc) - def remove_all_loras(self): if not self.lora_manager: raise RuntimeError("LoRA is not enabled.") @@ -876,6 +906,10 @@ def capture_model(self, kv_caches: List[torch.Tensor]) -> None: seq_lens = torch.ones(max_batch_size, dtype=torch.int32).cuda() block_tables = torch.from_numpy(self.graph_block_tables).cuda() + # Prepare buffer for outputs. These will be reused for all batch sizes. + # It will be filled after the first graph capture. 
+ hidden_states: Optional[torch.Tensor] = None + graph_batch_size = _get_graph_batch_size( self.scheduler_config.max_num_seqs) batch_size_capture_list = [ @@ -905,16 +939,18 @@ def capture_model(self, kv_caches: List[torch.Tensor]) -> None: ) if self.lora_config: - lora_mapping = LoRAMapping([0] * batch_size, - [0] * batch_size, - [0] * batch_size, - [1] * batch_size, False) + lora_mapping = LoRAMapping( + [0] * batch_size, + [0] * batch_size, + ) self.set_active_loras(set(), lora_mapping) graph_runner = CUDAGraphRunner(self.model) - graph_runner.capture( + hidden_states = graph_runner.capture( input_tokens[:batch_size], input_positions[:batch_size], + hidden_states[:batch_size] + if hidden_states is not None else None, kv_caches, attn_metadata, memory_pool=self.graph_memory_pool, @@ -951,35 +987,46 @@ def capture( self, input_ids: torch.Tensor, positions: torch.Tensor, + hidden_states: Optional[torch.Tensor], kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, memory_pool: Optional[Tuple[int, int]], stream: torch.cuda.Stream, **kwargs, - ) -> None: + ) -> torch.Tensor: assert self._graph is None - # Run the model once without capturing the graph. + # Run the model a few times without capturing the graph. # This is to make sure that the captured graph does not include the # kernel launches for initial benchmarking (e.g., Triton autotune). - self.model( - input_ids, - positions, - kv_caches, - attn_metadata, - **kwargs, - ) + # Note one iteration is not enough for torch.jit.script + for _ in range(_NUM_WARMUP_ITERS): + self.model( + input_ids, + positions, + kv_caches, + attn_metadata, + **kwargs, + ) torch.cuda.synchronize() # Capture the graph. self._graph = torch.cuda.CUDAGraph() with torch.cuda.graph(self._graph, pool=memory_pool, stream=stream): - hidden_states = self.model( + output_hidden_states = self.model( input_ids, positions, kv_caches, attn_metadata, **kwargs, ) + if hidden_states is not None: + hidden_states.copy_(output_hidden_states) + else: + hidden_states = output_hidden_states + del output_hidden_states + # make sure `output_hidden_states` is deleted + # in the graph's memory pool + gc.collect() torch.cuda.synchronize() # Save the input and output buffers. @@ -992,7 +1039,7 @@ def capture( "block_tables": attn_metadata.decode_metadata.block_tables, } self.output_buffers = {"hidden_states": hidden_states} - return + return hidden_states def forward( self, @@ -1039,24 +1086,6 @@ def _get_graph_batch_size(batch_size: int) -> int: _BATCH_SIZE_ALIGNMENT * _BATCH_SIZE_ALIGNMENT) -def _prepare_fake_inputs( - seq_len: int, vision_language_config: Optional[VisionLanguageConfig]): - """Prepare fake inputs for profile run.""" - if vision_language_config: - prompt_tokens = [ - vision_language_config.image_token_id - ] * vision_language_config.image_feature_size + [0] * ( - seq_len - vision_language_config.image_feature_size) - fake_image_input = MultiModalData( - type=MultiModalData.Type.IMAGE, - data=torch.zeros(vision_language_config.image_input_shape, - dtype=torch.float16)) - else: - prompt_tokens = [0] * seq_len - fake_image_input = None - return SequenceData(prompt_tokens), fake_image_input - - def _is_block_tables_empty(block_tables: Union[None, Dict]): """ Check if block_tables is None or a dictionary with all None values. 
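The capture path above warms the model up for _NUM_WARMUP_ITERS before recording, and capture() now returns the hidden-states buffer so smaller batch sizes can reuse a slice of it. A rough sketch of the call pattern, assuming batch sizes are visited largest-first in capture_model and that the surrounding attributes (self.model, self.graph_memory_pool, kv_caches, and a capture stream named graph_capture_stream here) are already in scope:

    # Illustrative only; memory-pool and stream plumbing is elided.
    hidden_states = None
    for batch_size in sorted(batch_size_capture_list, reverse=True):
        graph_runner = CUDAGraphRunner(self.model)
        hidden_states = graph_runner.capture(
            input_tokens[:batch_size],
            input_positions[:batch_size],
            # Reuse the buffer captured for the largest batch, if any.
            hidden_states[:batch_size] if hidden_states is not None else None,
            kv_caches,
            attn_metadata,
            memory_pool=self.graph_memory_pool,
            stream=graph_capture_stream,  # assumed name for the capture stream
        )
        self.graph_runners[batch_size] = graph_runner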
From 2bc0668bde9d8931bd6ec38d21558aeffc2f1a81 Mon Sep 17 00:00:00 2001 From: jeejeeli Date: Tue, 18 Jun 2024 11:33:12 +0800 Subject: [PATCH 25/71] trigger testing --- tests/lora/test_triton_punica.py | 2 +- vllm/lora/fully_sharded_layers.py | 238 +++++++++--- vllm/lora/layers.py | 591 ++++++++++++++++-------------- vllm/lora/models.py | 293 ++++++++------- vllm/lora/punica.py | 385 +++++++++---------- vllm/worker/model_runner.py | 294 ++++++--------- 6 files changed, 950 insertions(+), 853 deletions(-) diff --git a/tests/lora/test_triton_punica.py b/tests/lora/test_triton_punica.py index 1a5fd9e3f4d7..a098aba16456 100644 --- a/tests/lora/test_triton_punica.py +++ b/tests/lora/test_triton_punica.py @@ -468,7 +468,7 @@ def test_triton_bgmv_punica_bgmv( ref_out_tensor = ref_out_tensor.to(torch.float32) assert_close(our_out_tensor, ref_out_tensor) -@pytest.mark.skip("stop") + @pytest.mark.parametrize("batchs", BATCHS) @pytest.mark.parametrize("hidden_size", HIDDEN_SIZES) @pytest.mark.parametrize("nslices", NSLICES) diff --git a/vllm/lora/fully_sharded_layers.py b/vllm/lora/fully_sharded_layers.py index ffdc32b7339a..fbea667a215e 100644 --- a/vllm/lora/fully_sharded_layers.py +++ b/vllm/lora/fully_sharded_layers.py @@ -7,13 +7,22 @@ from vllm.config import LoRAConfig from vllm.distributed.communication_op import ( - tensor_model_parallel_all_gather, tensor_model_parallel_all_reduce) + tensor_model_parallel_all_gather, + tensor_model_parallel_all_reduce, +) from vllm.distributed.parallel_state import get_tensor_model_parallel_rank -from vllm.lora.layers import (ColumnParallelLinearWithLoRA, - MergedColumnParallelLinearWithLoRA, - MergedQKVParallelLinearWithLora, - RowParallelLinearWithLoRA) +from vllm.lora.layers import ( + ColumnParallelLinearWithLoRA, + MergedColumnParallelLinearWithLoRA, + MergedQKVParallelLinearWithLora, + RowParallelLinearWithLoRA, +) from vllm.lora.punica import bgmv, dispatch_bgmv_low_level +from vllm.lora.punica import ( + add_shrink_triton, + add_expand_triton, + add_expand_slice_triton, +) if TYPE_CHECKING: pass @@ -27,7 +36,7 @@ def _fully_sharded_can_replace(can_replace): def dec(*args, **kwargs): return (can_replace(*args, **kwargs) - and kwargs['lora_config'].fully_sharded_loras) + and kwargs["lora_config"].fully_sharded_loras) return dec @@ -58,15 +67,49 @@ def apply(self, x: torch.Tensor, x = x.view(-1, x.shape[-1]) output, out_orig_shape = output.view(-1, output.shape[-1]), output.shape - buffer = torch.zeros((x.shape[0], self.lora_a_stacked.shape[2]), - dtype=torch.float32, - device=x.device) + buffer = torch.zeros( + (x.shape[0], self.lora_a_stacked.shape[2]), + dtype=torch.float32, + device=x.device, + ) - bgmv(buffer, x, self.lora_a_stacked, - self.indices[:self.indices_len[0]], 0, 1.0) + # bgmv( + # buffer, + # x, + # self.lora_a_stacked, + # self.indices[: self.indices_len[0]], + # 0, + # 1.0, + # ) + token_num = self.indices_len[0] + is_prefilling = bool(self.indices_len[4]) + add_shrink_triton( + buffer, + x, + self.lora_a_stacked, + self.indices[:token_num], + 0, + 1.0, + is_prefilling, + ) buffer = tensor_model_parallel_all_gather(buffer) - bgmv(output, buffer, self.lora_b_stacked, - self.indices[:self.indices_len[0]], 0, 1.0) + # bgmv( + # output, + # buffer, + # self.lora_b_stacked, + # self.indices[: self.indices_len[0]], + # 0, + # 1.0, + # ) + add_expand_triton( + output, + buffer, + self.lora_b_stacked, + self.indices[:token_num], + 0, + is_prefilling, + add_input=True, + ) # now have column partitioned output output = 
output.view(*out_orig_shape) @@ -74,9 +117,13 @@ def apply(self, x: torch.Tensor, @classmethod @_fully_sharded_can_replace - def can_replace_layer(cls, source_layer: nn.Module, - lora_config: LoRAConfig, packed_modules_list: List, - model_config: Optional[PretrainedConfig]) -> bool: + def can_replace_layer( + cls, + source_layer: nn.Module, + lora_config: LoRAConfig, + packed_modules_list: List, + model_config: Optional[PretrainedConfig], + ) -> bool: # specifying kwargs so they can be easily accessed in decorator return super().can_replace_layer( source_layer=source_layer, @@ -89,12 +136,12 @@ def can_replace_layer(cls, source_layer: nn.Module, def _mcp_apply(x, bias, layer): """ - MergedColumnParallelLinearWithShardedLoRA and - QKVParallelLinearWithShardedLora share the same + MergedColumnParallelLinearWithShardedLoRA and + QKVParallelLinearWithShardedLora share the same LoRa weight application method. - + The main difference is the step by shard_size for lora_b which can - vary for QKVParallelLinearWithShardedLora but is constant for + vary for QKVParallelLinearWithShardedLora but is constant for MergedColumnParallelLinearWithShardedLoRA. """ # expecting 2 for column parallel and 3 for qkv @@ -103,21 +150,58 @@ def _mcp_apply(x, bias, layer): x = x.view(-1, x.shape[-1]) output, out_orig_shape = output.view(-1, output.shape[-1]), output.shape - buffers = torch.zeros((n, x.shape[0], layer.lora_a_stacked[0].shape[2]), - dtype=torch.float32, - device=x.device) + buffers = torch.zeros( + (n, x.shape[0], layer.lora_a_stacked[0].shape[2]), + dtype=torch.float32, + device=x.device, + ) + token_num = layer.indices_len[0] + is_prefilling = bool(layer.indices_len[4]) for idx in range(n): - bgmv(buffers[idx], x, layer.lora_a_stacked[idx], - layer.indices[:layer.indices_len[0]], 0, 1.0) + # bgmv( + # buffers[idx], + # x, + # layer.lora_a_stacked[idx], + # layer.indices[: layer.indices_len[0]], + # 0, + # 1.0, + # ) + + add_shrink_triton( + buffers[idx], + x, + layer.lora_a_stacked[idx], + layer.indices[:token_num], + 0, + 1.0, + is_prefilling, + ) buffers = tensor_model_parallel_all_gather(buffers) left_offset = 0 for idx in range(n): shard_size = layer.lora_b_stacked[idx].shape[2] - dispatch_bgmv_low_level(output, buffers[idx], - layer.lora_b_stacked[idx], - layer.indices[:layer.indices_len[0]], 0, 1.0, - left_offset, shard_size) + # dispatch_bgmv_low_level( + # output, + # buffers[idx], + # layer.lora_b_stacked[idx], + # layer.indices[: layer.indices_len[0]], + # 0, + # 1.0, + # left_offset, + # shard_size, + # ) + add_expand_slice_triton( + output, + buffers[idx], + layer.lora_b_stacked[idx], + layer.indices[:layer.indices_len[0]], + 0, + is_prefilling, + left_offset, + shard_size, + add_input=True, + ) left_offset += shard_size output = output.view(*out_orig_shape) @@ -128,7 +212,7 @@ def _mcp_apply(x, bias, layer): class MergedColumnParallelLinearWithShardedLoRA( MergedColumnParallelLinearWithLoRA): """ - Differs from MergedColumnParallelLinearWithLoRA by slicing the + Differs from MergedColumnParallelLinearWithLoRA by slicing the LoRA A's also. Based on S-LoRA, slicing happens along the rank dim. 
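Condensed restatement of the flow in _mcp_apply above, purely as a reading aid; the helper names are the ones imported from vllm.lora.punica in this patch, while the function name and shapes in the comments are illustrative:

    import torch
    from vllm.distributed.communication_op import tensor_model_parallel_all_gather
    from vllm.lora.punica import add_shrink_triton, add_expand_slice_triton

    def sharded_lora_apply(x, lora_a_stacked, lora_b_stacked, indices,
                           indices_len, output):
        token_num = indices_len[0]
        is_prefilling = bool(indices_len[4])
        n = len(lora_a_stacked)
        # Each shrink writes a rank-sharded projection into its own fp32 buffer.
        buffers = torch.zeros((n, x.shape[0], lora_a_stacked[0].shape[2]),
                              dtype=torch.float32, device=x.device)
        for i in range(n):
            add_shrink_triton(buffers[i], x, lora_a_stacked[i],
                              indices[:token_num], 0, 1.0, is_prefilling)
        # All-gather completes the rank dimension across tensor-parallel ranks.
        buffers = tensor_model_parallel_all_gather(buffers)
        # Each expand adds its slice of the output at a running column offset.
        offset = 0
        for i in range(n):
            shard_size = lora_b_stacked[i].shape[2]
            add_expand_slice_triton(output, buffers[i], lora_b_stacked[i],
                                    indices[:token_num], 0, is_prefilling,
                                    offset, shard_size, add_input=True)
            offset += shard_size
        return output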
@@ -144,7 +228,8 @@ def slice_lora_a( lora_a = [ lora_a[0][:, output_start_idx:output_start_idx + output_shard_size], - lora_a[1][:, output_start_idx:output_start_idx + output_shard_size] + lora_a[1][:, + output_start_idx:output_start_idx + output_shard_size], ] return lora_a @@ -154,9 +239,13 @@ def apply(self, x: torch.Tensor, @classmethod @_fully_sharded_can_replace - def can_replace_layer(cls, source_layer: nn.Module, - lora_config: LoRAConfig, packed_modules_list: List, - model_config: Optional[PretrainedConfig]) -> bool: + def can_replace_layer( + cls, + source_layer: nn.Module, + lora_config: LoRAConfig, + packed_modules_list: List, + model_config: Optional[PretrainedConfig], + ) -> bool: # specifying kwargs so they can be easily accessed in decorator return super().can_replace_layer( source_layer=source_layer, @@ -169,7 +258,7 @@ def can_replace_layer(cls, source_layer: nn.Module, class MergedQKVParallelLinearWithShardedLora(MergedQKVParallelLinearWithLora): """ - Differs from QKVParallelLinearWithLora by slicing the + Differs from QKVParallelLinearWithLora by slicing the LoRA A's also. Based on S-LoRA, slicing happens along the rank dim. @@ -185,7 +274,7 @@ def slice_lora_a( lora_a = [ lora_a[0][:, start_idx[0]:start_idx[0] + shard_size[0]], lora_a[1][:, start_idx[1]:start_idx[1] + shard_size[1]], - lora_a[2][:, start_idx[2]:start_idx[2] + shard_size[2]] + lora_a[2][:, start_idx[2]:start_idx[2] + shard_size[2]], ] return lora_a @@ -195,9 +284,13 @@ def apply(self, x: torch.Tensor, @classmethod @_fully_sharded_can_replace - def can_replace_layer(cls, source_layer: nn.Module, - lora_config: LoRAConfig, packed_modules_list: List, - model_config: Optional[PretrainedConfig]) -> bool: + def can_replace_layer( + cls, + source_layer: nn.Module, + lora_config: LoRAConfig, + packed_modules_list: List, + model_config: Optional[PretrainedConfig], + ) -> bool: # specifying kwargs so they can be easily accessed in decorator return super().can_replace_layer( source_layer=source_layer, @@ -210,11 +303,11 @@ def can_replace_layer(cls, source_layer: nn.Module, class RowParallelLinearWithShardedLoRA(RowParallelLinearWithLoRA): """ - Differs from RowParallelLinearWithLoRA by slicing the + Differs from RowParallelLinearWithLoRA by slicing the LoRA B's also. Based on S-LoRA, slicing happens along the output dim. - This yields a combined partial sum from the row parallel base + This yields a combined partial sum from the row parallel base layer and column partitioned output from the LoRA. 
""" @@ -231,11 +324,30 @@ def apply(self, x: torch.Tensor) -> torch.Tensor: x = x.view(-1, x.shape[-1]) output, out_orig_shape = output.view(-1, output.shape[-1]), output.shape - buffer = torch.zeros((x.shape[0], self.lora_a_stacked.shape[2]), - dtype=torch.float32, - device=x.device) - bgmv(buffer, x, self.lora_a_stacked, - self.indices[:self.indices_len[0]], 0, 1.0) + buffer = torch.zeros( + (x.shape[0], self.lora_a_stacked.shape[2]), + dtype=torch.float32, + device=x.device, + ) + # bgmv( + # buffer, + # x, + # self.lora_a_stacked, + # self.indices[: self.indices_len[0]], + # 0, + # 1.0, + # ) + token_num = self.indices_len[0] + is_prefilling = bool(self.indices_len[4]) + add_shrink_triton( + buffer, + x, + self.lora_a_stacked, + self.indices[:token_num], + 0, + 1.0, + is_prefilling, + ) buffer = tensor_model_parallel_all_reduce(buffer) # following S-LoRA, allows the fusing of all_gather and all_reduce @@ -246,18 +358,38 @@ def apply(self, x: torch.Tensor) -> torch.Tensor: # reduced before being used shard_size = self.lora_b_stacked.shape[2] start_idx = self.tp_rank * shard_size - dispatch_bgmv_low_level(output, buffer, self.lora_b_stacked, - self.indices[:self.indices_len[0]], 0, 1.0, - start_idx, shard_size) - + # dispatch_bgmv_low_level( + # output, + # buffer, + # self.lora_b_stacked, + # self.indices[: self.indices_len[0]], + # 0, + # 1.0, + # start_idx, + # shard_size, + # ) + add_expand_slice_triton( + output, + buffer, + self.lora_b_stacked, + self.indices[:self.indices_len[0]], + 0, + is_prefilling, + start_idx, + shard_size, + ) output = output.view(*out_orig_shape) return output @classmethod @_fully_sharded_can_replace - def can_replace_layer(cls, source_layer: nn.Module, - lora_config: LoRAConfig, packed_modules_list: List, - model_config: Optional[PretrainedConfig]) -> bool: + def can_replace_layer( + cls, + source_layer: nn.Module, + lora_config: LoRAConfig, + packed_modules_list: List, + model_config: Optional[PretrainedConfig], + ) -> bool: # specifying kwargs so they can be easily accessed in decorator return super().can_replace_layer( source_layer=source_layer, diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py index 96b37ab8880c..1dd89df3c4f6 100644 --- a/vllm/lora/layers.py +++ b/vllm/lora/layers.py @@ -9,24 +9,34 @@ from transformers import PretrainedConfig from vllm.config import LoRAConfig -from vllm.distributed import (get_tensor_model_parallel_rank, - get_tensor_model_parallel_world_size, - split_tensor_along_last_dim, - tensor_model_parallel_all_gather, - tensor_model_parallel_all_reduce, - tensor_model_parallel_gather) +from vllm.distributed import ( + get_tensor_model_parallel_rank, + get_tensor_model_parallel_world_size, + split_tensor_along_last_dim, + tensor_model_parallel_all_gather, + tensor_model_parallel_all_reduce, + tensor_model_parallel_gather, +) from vllm.distributed.utils import divide -from vllm.lora.ops.sgmv_expand import sgmv_expand -from vllm.lora.punica import add_lora_triton, add_lora_triton_slice -from vllm.model_executor.layers.linear import (ColumnParallelLinear, - MergedColumnParallelLinear, - QKVParallelLinear, - RowParallelLinear) + +# from vllm.lora.ops.sgmv_expand import sgmv_expand +from vllm.lora.punica import ( + add_lora_triton, + add_expand_triton, +) +from vllm.model_executor.layers.linear import ( + ColumnParallelLinear, + MergedColumnParallelLinear, + QKVParallelLinear, + RowParallelLinear, +) from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.rotary_embedding import ( 
- LinearScalingRotaryEmbedding, RotaryEmbedding) + LinearScalingRotaryEmbedding, + RotaryEmbedding, +) from vllm.model_executor.layers.vocab_parallel_embedding import ( - VocabParallelEmbedding) + VocabParallelEmbedding, ) if TYPE_CHECKING: pass @@ -55,26 +65,49 @@ def _not_fully_sharded_can_replace(can_replace): """ def dec(*args, **kwargs): - decorate = kwargs.pop('decorate') if 'decorate' in kwargs else True - condition = (not kwargs['lora_config'].fully_sharded_loras + decorate = kwargs.pop("decorate") if "decorate" in kwargs else True + condition = (not kwargs["lora_config"].fully_sharded_loras if decorate else True) return can_replace(*args, **kwargs) and condition return dec +def _apply_expand_triton( + x: torch.Tensor, + lora_b_stacked: torch.Tensor, + lora_index_tensor: torch.Tensor, + indices_info: List[int], + output: torch.Tensor, + add_input: bool = True, +) -> torch.Tensor: + org_output = output + x = x.view(-1, x.shape[-1]) + output = output.view(-1, output.shape[-1]) + token_num = indices_info[0] + is_prefilling = bool(indices_info[4]) + add_expand_triton( + output, + x, + lora_b_stacked, + lora_index_tensor[:token_num], + 0, + is_prefilling, + add_input, + ) + return output.view_as(org_output) + + def _apply_lora_triton( x: torch.Tensor, lora_a_stacked: torch.Tensor, lora_b_stacked: torch.Tensor, - b_seq_start_tensor: torch.Tensor, - seq_length_tensor: torch.Tensor, lora_index_tensor: torch.Tensor, - batch_mlen_stage_lst: List[int], + indices_info: List[int], output: torch.Tensor, ) -> torch.Tensor: - """Applies lora to each input. This method applies all loras to each - input. It uses the `lora_index_tensor` vector to determine which lora + """Applies lora to each input. This method applies all loras to each + input. It uses the `lora_index_tensor` vector to determine which lora yields the correct output. An index of -1 means no lora should be applied. This method adds the final lora results to the output. @@ -82,35 +115,33 @@ def _apply_lora_triton( x (torch.Tensor): (batch_size, hidden_dim) lora_a_stacked (torch.Tensor): (num_loras, lora_rank, hidden_dim) lora_b_stacked (torch.Tensor): (num_loras, output_dim, lora_rank) - b_seq_start_tensor (torch.Tensor): (batch_size,). The cumulative - sequence lengths of the sequences in the batch, used to index - into sequence. E.g.,if the sequence length is [4, 6], it is - [0, 4]. - seq_length_tensor (torch.Tensor): batch_size,). record the sequence - length of the sequences in the batch - lora_index_tensor (torch.Tensor): (batch_size,). The LoRA index - corresponding to each batch - batch_mlen_stage_lst (List[int]): (3,).Sequentially represent batch - size, maximum seq length, and prefilling stage flag. - output (torch.Tensor): (batch_size, output_dim) - + lora_index_tensor (torch.Tensor): (batch_size*seq_number,). The LoRA + index corresponding to each token + indices_info: List[int]: 5 is the number of indicies tensors. 
+ # base_indices, sampler_indices, sampler_indices_padded, + # embeddings_indices,prefilling or decoding + output (torch.Tensor): (batch_size, output_dim) + Returns: - output (torch.Tensor): (batch_size, output_dim) - + output (torch.Tensor): (batch_size*seq_number, output_dim) + """ org_output = output x = x.view(-1, x.shape[-1]) output = output.view(-1, output.shape[-1]) - # - batch_size = batch_mlen_stage_lst[0] - max_length = batch_mlen_stage_lst[1] - is_prefilling = bool(batch_mlen_stage_lst[2]) - - add_lora_triton(output, x, lora_a_stacked, lora_b_stacked, - b_seq_start_tensor[:batch_size], - seq_length_tensor[:batch_size], - lora_index_tensor[:batch_size], batch_size, max_length, 0, - 1.0, is_prefilling) + + token_num = indices_info[0] + is_prefilling = bool(indices_info[4]) + add_lora_triton( + output, + x, + lora_a_stacked, + lora_b_stacked, + lora_index_tensor[:token_num], + 0, + 1.0, + is_prefilling, + ) return output.view_as(org_output) @@ -118,52 +149,46 @@ def _apply_lora_triton_nslice( x: torch.Tensor, lora_a_stacked: Tuple[torch.Tensor, torch.Tensor, torch.Tensor], lora_b_stacked: Tuple[torch.Tensor, torch.Tensor, torch.Tensor], - b_seq_start_tensor: torch.Tensor, - seq_length_tensor: torch.Tensor, lora_index_tensor: torch.Tensor, - batch_mlen_stage_lst: List[int], + indices_info: List[int], output: torch.Tensor, output_slices: Tuple[int, ...], ) -> torch.Tensor: - """Applies lora to each input. This method applies all loras to each - input. It uses the `lora_index_tensor` vector to determine which lora - yields the correct output. An index of -1 means no lora should be - applied. This method adds the final lora results to the output. + """_summary_ Args: - x (torch.Tensor): (batch_size, hidden_dim) - lora_a_stacked (torch.Tensor): (num_loras, lora_rank, hidden_dim) - lora_b_stacked (torch.Tensor): (num_loras, output_dim, lora_rank) - b_seq_start_tensor (torch.Tensor): (batch_size,). The cumulative - sequence lengths of the sequences in the batch, used to index - into sequence. E.g.,if the sequence length is [4, 6], it is - [0, 4]. - seq_length_tensor (torch.Tensor): batch_size,). record the sequence - length of the sequences in the batch - lora_index_tensor (torch.Tensor): (batch_size,). The LoRA index - corresponding to each batch - batch_mlen_stage_lst (List[int]): (3,).Sequentially represent batch - size, maximum seq length, and prefilling stage flag. 
- output_slices (Tuple[int, ...]): Size of each output column + x (torch.Tensor): _description_ + lora_a_stacked (Tuple[torch.Tensor, torch.Tensor, torch.Tensor]): _description_ + lora_b_stacked (Tuple[torch.Tensor, torch.Tensor, torch.Tensor]): _description_ + lora_index_tensor (torch.Tensor): _description_ + indices_info (List[int]): _description_ + output (torch.Tensor): _description_ + output_slices (Tuple[int, ...]): _description_ Returns: - output (torch.Tensor): (batch_size, output_dim) + torch.Tensor: _description_ """ org_output = output x = x.view(-1, x.shape[-1]) output = output.view(-1, output.shape[-1]) - batch_size = batch_mlen_stage_lst[0] - max_length = batch_mlen_stage_lst[1] - is_prefilling = bool(batch_mlen_stage_lst[2]) + token_num = indices_info[0] + is_prefilling = bool(indices_info[4]) offset_left = 0 - #TODO fuse these kernel + # TODO fuse these kernels for slice_idx in range(len(output_slices)): - add_lora_triton_slice( - output, x, lora_a_stacked[slice_idx], lora_b_stacked[slice_idx], - b_seq_start_tensor[:batch_size], seq_length_tensor[:batch_size], - lora_index_tensor[:batch_size], batch_size, max_length, 0, 1.0, - offset_left, output_slices[slice_idx], is_prefilling) + add_lora_triton( + output, + x, + lora_a_stacked[slice_idx], + lora_b_stacked[slice_idx], + lora_index_tensor[:token_num], + 0, + 1.0, + is_prefilling, + offset_left, + output_slices[slice_idx], + ) offset_left += output_slices[slice_idx] return output.view_as(org_output) @@ -175,10 +200,6 @@ class LoRAMapping: index_mapping: Tuple[int, ...] # Per sampled token: prompt_mapping: Tuple[int, ...] - # Per batch lora index - batch_mapping: List[int] = field(default_factory=list) - # Per batch seq length - seq_lens: List[int] = field(default_factory=list) # prefilling or decoding. is_prefilling: bool = False @@ -202,10 +223,11 @@ def slice_lora_b( ... def create_lora_weights( - self, - max_loras: int, - lora_config: LoRAConfig, - model_config: Optional[PretrainedConfig] = None) -> None: + self, + max_loras: int, + lora_config: LoRAConfig, + model_config: Optional[PretrainedConfig] = None, + ) -> None: """Initializes lora matrices.""" ... @@ -224,18 +246,25 @@ def set_lora( ... def set_mapping( - self, base_indices: torch.Tensor, sampler_indices: torch.Tensor, - sampler_indices_padded: torch.Tensor, - embeddings_indices: torch.Tensor, long_lora_indices: torch.Tensor, - indices_len: List[int], seq_length_tensor: torch.Tensor, - b_seq_start_tensor: torch.Tensor, batch_mlen_stage_lst: List[int]): + self, + base_indices: torch.Tensor, + sampler_indices: torch.Tensor, + sampler_indices_padded: torch.Tensor, + embeddings_indices: torch.Tensor, + long_lora_indices: torch.Tensor, + indices_len: List[int], + ): """Sets the mapping indices.""" ... 
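A small worked example of the running column offsets used by _apply_lora_triton_nslice above; the shard sizes are invented, and with three slices they would correspond to this rank's Q, K and V partitions:

    # e.g. q_proj_shard_size=4096 and kv_proj_shard_size=1024 on this rank
    output_slices = (4096, 1024, 1024)

    offset_left = 0
    for slice_idx, slice_size in enumerate(output_slices):
        # slice 0 -> output[:, 0:4096]     (Q)
        # slice 1 -> output[:, 4096:5120]  (K)
        # slice 2 -> output[:, 5120:6144]  (V)
        print(slice_idx, offset_left, offset_left + slice_size)
        offset_left += slice_size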
@classmethod - def can_replace_layer(cls, source_layer: nn.Module, - lora_config: LoRAConfig, packed_modules_list: List, - model_config: Optional[PretrainedConfig]) -> bool: + def can_replace_layer( + cls, + source_layer: nn.Module, + lora_config: LoRAConfig, + packed_modules_list: List, + model_config: Optional[PretrainedConfig], + ) -> bool: """Returns True if the layer can be replaced by this LoRA layer.""" raise NotImplementedError @@ -249,22 +278,23 @@ def __init__(self, base_layer: VocabParallelEmbedding) -> None: self.embeddings_weights: Optional[torch.Tensor] def create_lora_weights( - self, - max_loras: int, - lora_config: LoRAConfig, - model_config: Optional[PretrainedConfig] = None) -> None: - + self, + max_loras: int, + lora_config: LoRAConfig, + model_config: Optional[PretrainedConfig] = None, + ) -> None: lora_vocab_start_idx = self.base_layer.org_vocab_size weights_idx = None if self.base_layer.vocab_end_index > lora_vocab_start_idx: # We can start adding lora weights weights_idx = max( lora_vocab_start_idx - self.base_layer.vocab_start_index, 0) - self.embeddings_slice = (self.base_layer.vocab_start_index - - self.base_layer.org_vocab_size + - weights_idx, - self.base_layer.vocab_end_index - - self.base_layer.org_vocab_size) + self.embeddings_slice = ( + self.base_layer.vocab_start_index - + self.base_layer.org_vocab_size + weights_idx, + self.base_layer.vocab_end_index - + self.base_layer.org_vocab_size, + ) self.embeddings_weights = self.base_layer.weight.data[weights_idx:] self.embeddings_weights.fill_(0) else: @@ -309,10 +339,6 @@ def create_lora_weights( self.indices_len: List[int] self.embeddings_indices: torch.Tensor - self.seq_length_tensor: torch.Tensor - self.b_seq_start_tensor: torch.Tensor - self.batch_mlen_stage_lst: List[int] - def reset_lora(self, index: int): self.lora_a_stacked[index] = 0 self.lora_b_stacked[index] = 0 @@ -334,30 +360,30 @@ def set_lora( if embeddings_tensor is not None: self.embeddings_tensors[ index, :embeddings_tensor.shape[0], :embeddings_tensor. 
- shape[1]].copy_(embeddings_tensor, non_blocking=True) + shape[1], ].copy_(embeddings_tensor, non_blocking=True) if self.embeddings_slice is not None: # TODO(yard1): Optimize this copy, we don't need to copy # everything, just the modified part embeddings = self.embeddings_tensors.view( self.embeddings_tensors.shape[0] * self.embeddings_tensors.shape[1], - self.embeddings_tensors.shape[2] + self.embeddings_tensors.shape[2], )[self.embeddings_slice[0]:self.embeddings_slice[1]] assert self.embeddings_weights is not None self.embeddings_weights[:embeddings.shape[0]].copy_(embeddings) def set_mapping( - self, base_indices: torch.Tensor, sampler_indices: torch.Tensor, - sampler_indices_padded: torch.Tensor, - embeddings_indices: torch.Tensor, long_lora_indices: torch.Tensor, - indices_len: List[int], seq_length_tensor: torch.Tensor, - b_seq_start_tensor: torch.Tensor, batch_mlen_stage_lst: List[int]): + self, + base_indices: torch.Tensor, + sampler_indices: torch.Tensor, + sampler_indices_padded: torch.Tensor, + embeddings_indices: torch.Tensor, + long_lora_indices: torch.Tensor, + indices_len: List[int], + ): self.indices = base_indices self.embeddings_indices = embeddings_indices self.indices_len = indices_len - self.seq_length_tensor = seq_length_tensor - self.b_seq_start_tensor = b_seq_start_tensor - self.batch_mlen_stage_lst = batch_mlen_stage_lst def forward(self, x: torch.Tensor) -> torch.Tensor: added_tokens_mask = x > self.base_layer.org_vocab_size - 1 @@ -378,34 +404,34 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: if full_lora_a_embeddings.ndim == 3: full_lora_a_embeddings = full_lora_a_embeddings.view( full_lora_a_embeddings.shape[0] * - full_lora_a_embeddings.shape[1], -1) - batch_size, max_length = self.batch_mlen_stage_lst[ - 0], self.batch_mlen_stage_lst[1] - - sgmv_expand( + full_lora_a_embeddings.shape[1], + -1, + ) + _apply_expand_triton( full_lora_a_embeddings, self.lora_b_stacked, + self.indices, + self.indices_len, full_output, - self.b_seq_start_tensor[:batch_size], - self.seq_length_tensor[:batch_size], - self.indices[:batch_size], - batch_size, - max_length, - True, + add_input=True, ) return full_output.view_as(full_output_org) @classmethod - def can_replace_layer(cls, source_layer: nn.Module, - lora_config: LoRAConfig, packed_modules_list: List, - model_config: Optional[PretrainedConfig]) -> bool: + def can_replace_layer( + cls, + source_layer: nn.Module, + lora_config: LoRAConfig, + packed_modules_list: List, + model_config: Optional[PretrainedConfig], + ) -> bool: return type(source_layer) is VocabParallelEmbedding class ColumnParallelLinearWithLoRA(BaseLayerWithLoRA): """ LoRA on top of ColumnParallelLinear layer. - + LoRA B is sliced for tensor parallelism. """ @@ -418,10 +444,11 @@ def __init__(self, base_layer: ColumnParallelLinear) -> None: self.device = _get_lora_device(self.base_layer) def create_lora_weights( - self, - max_loras: int, - lora_config: LoRAConfig, - model_config: Optional[PretrainedConfig] = None) -> None: + self, + max_loras: int, + lora_config: LoRAConfig, + model_config: Optional[PretrainedConfig] = None, + ) -> None: self.lora_config = lora_config self.tp_size = get_tensor_model_parallel_world_size() lora_a_output_size_per_partition = ( @@ -448,9 +475,6 @@ def create_lora_weights( # lazily initialized. 
self.indices: torch.Tensor self.indices_len: List[int] - self.seq_length_tensor: torch.Tensor - self.b_seq_start_tensor: torch.Tensor - self.batch_mlen_stage_lst: List[int] def reset_lora(self, index: int): self.lora_a_stacked[index] = 0 @@ -488,23 +512,28 @@ def set_lora( lora_b.T, non_blocking=True) def set_mapping( - self, base_indices: torch.Tensor, sampler_indices: torch.Tensor, - sampler_indices_padded: torch.Tensor, - embeddings_indices: torch.Tensor, long_lora_indices: torch.Tensor, - indices_len: List[int], seq_length_tensor: torch.Tensor, - b_seq_start_tensor: torch.Tensor, batch_mlen_stage_lst: List[int]): + self, + base_indices: torch.Tensor, + sampler_indices: torch.Tensor, + sampler_indices_padded: torch.Tensor, + embeddings_indices: torch.Tensor, + long_lora_indices: torch.Tensor, + indices_len: List[int], + ): self.indices = base_indices self.indices_len = indices_len - self.seq_length_tensor = seq_length_tensor - self.b_seq_start_tensor = b_seq_start_tensor - self.batch_mlen_stage_lst = batch_mlen_stage_lst def apply(self, x: torch.Tensor, bias: Optional[torch.Tensor]) -> torch.Tensor: output = self.base_layer.quant_method.apply(self.base_layer, x, bias) - _apply_lora_triton(x, self.lora_a_stacked, self.lora_b_stacked, - self.b_seq_start_tensor, self.seq_length_tensor, - self.indices, self.batch_mlen_stage_lst, output) + _apply_lora_triton( + x, + self.lora_a_stacked, + self.lora_b_stacked, + self.indices, + self.indices_len, + output, + ) return output def forward(self, input_): @@ -533,9 +562,13 @@ def forward(self, input_): @classmethod @_not_fully_sharded_can_replace - def can_replace_layer(cls, source_layer: nn.Module, - lora_config: LoRAConfig, packed_modules_list: List, - model_config: Optional[PretrainedConfig]) -> bool: + def can_replace_layer( + cls, + source_layer: nn.Module, + lora_config: LoRAConfig, + packed_modules_list: List, + model_config: Optional[PretrainedConfig], + ) -> bool: return type(source_layer) is ColumnParallelLinear or ( type(source_layer) is MergedColumnParallelLinear and len(packed_modules_list) == 1) @@ -554,10 +587,11 @@ def __init__(self, base_layer: MergedColumnParallelLinear) -> None: super().__init__(base_layer) def create_lora_weights( - self, - max_loras: int, - lora_config: LoRAConfig, - model_config: Optional[PretrainedConfig] = None) -> None: + self, + max_loras: int, + lora_config: LoRAConfig, + model_config: Optional[PretrainedConfig] = None, + ) -> None: self.lora_config = lora_config n_slices = 2 if not (len(self.base_layer.output_sizes) == n_slices @@ -597,11 +631,6 @@ def create_lora_weights( self.indices: torch.Tensor self.indices_len: torch.Tensor - self.seq_length_tensor: torch.Tensor - self.b_seq_start_tensor: torch.Tensor - self.lora_index_tensor: torch.Tensor - self.batch_mlen_stage_lst: List[int] - def reset_lora(self, index: int): self.lora_a_stacked[0][index] = 0 self.lora_a_stacked[1][index] = 0 @@ -622,7 +651,8 @@ def slice_lora_b( start_idx = self.tp_rank * shard_size end_idx = (self.tp_rank + 1) * shard_size lora_b = [ - lora_b[0][:, start_idx:end_idx], lora_b[1][:, start_idx:end_idx] + lora_b[0][:, start_idx:end_idx], + lora_b[1][:, start_idx:end_idx], ] return lora_b @@ -661,10 +691,8 @@ def apply(self, x: torch.Tensor, x, self.lora_a_stacked, self.lora_b_stacked, - self.b_seq_start_tensor, - self.seq_length_tensor, self.indices, - self.batch_mlen_stage_lst, + self.indices_len, output, (self.output_dim, self.output_dim), ) @@ -672,22 +700,26 @@ def apply(self, x: torch.Tensor, @classmethod 
@_not_fully_sharded_can_replace - def can_replace_layer(cls, source_layer: nn.Module, - lora_config: LoRAConfig, packed_modules_list: List, - model_config: Optional[PretrainedConfig]) -> bool: - return type(source_layer) is MergedColumnParallelLinear and len( - packed_modules_list) == 2 + def can_replace_layer( + cls, + source_layer: nn.Module, + lora_config: LoRAConfig, + packed_modules_list: List, + model_config: Optional[PretrainedConfig], + ) -> bool: + return (type(source_layer) is MergedColumnParallelLinear + and len(packed_modules_list) == 2) class QKVParallelLinearWithLora(ColumnParallelLinearWithLoRA): """ - ColumnParallelLinear layer that is specifically designed for - qkv_proj. Certain models, such as chtglm3 and baichuan-7b, - only contains a single LoRA within their qkv_proj layer. + ColumnParallelLinear layer that is specifically designed for + qkv_proj. Certain models, such as chtglm3 and baichuan-7b, + only contains a single LoRA within their qkv_proj layer. - During inference with Tensor Parallel, the weights of lora_b + During inference with Tensor Parallel, the weights of lora_b must be accurately partitioned according to the respective ranks. - + Q slice may have different shape than K and V slices (which both have the same shape). """ @@ -718,15 +750,17 @@ def set_lora( self.kv_shard_id = tp_rank // self.base_layer.num_kv_head_replicas lora_b_q = lora_b[:, self.q_proj_shard_size * self.q_shard_id:self.q_proj_shard_size * - (self.q_shard_id + 1)] + (self.q_shard_id + 1), ] k_offset = self.q_proj_total_size lora_b_k = lora_b[:, k_offset + self.kv_proj_shard_size * self.kv_shard_id:k_offset + - self.kv_proj_shard_size * (self.kv_shard_id + 1)] + self.kv_proj_shard_size * + (self.kv_shard_id + 1), ] v_offset = k_offset + self.kv_proj_total_size lora_b_v = lora_b[:, v_offset + self.kv_proj_shard_size * self.kv_shard_id:v_offset + - self.kv_proj_shard_size * (self.kv_shard_id + 1)] + self.kv_proj_shard_size * + (self.kv_shard_id + 1), ] lora_b = torch.cat([lora_b_q, lora_b_k, lora_b_v], dim=1) self.lora_a_stacked[index, @@ -737,11 +771,15 @@ def set_lora( lora_b.T, non_blocking=True) @classmethod - def can_replace_layer(cls, source_layer: nn.Module, - lora_config: LoRAConfig, packed_modules_list: List, - model_config: Optional[PretrainedConfig]) -> bool: - return type(source_layer) is QKVParallelLinear and len( - packed_modules_list) == 1 + def can_replace_layer( + cls, + source_layer: nn.Module, + lora_config: LoRAConfig, + packed_modules_list: List, + model_config: Optional[PretrainedConfig], + ) -> bool: + return (type(source_layer) is QKVParallelLinear + and len(packed_modules_list) == 1) class MergedQKVParallelLinearWithLora(ColumnParallelLinearWithLoRA): @@ -759,10 +797,11 @@ def __init__(self, base_layer: QKVParallelLinear) -> None: super().__init__(base_layer) def create_lora_weights( - self, - max_loras: int, - lora_config: LoRAConfig, - model_config: Optional[PretrainedConfig] = None) -> None: + self, + max_loras: int, + lora_config: LoRAConfig, + model_config: Optional[PretrainedConfig] = None, + ) -> None: self.lora_config = lora_config self.tp_size = get_tensor_model_parallel_world_size() self.tp_rank = get_tensor_model_parallel_rank() @@ -830,18 +869,17 @@ def create_lora_weights( ), ) - self.output_slices = (self.q_proj_shard_size, self.kv_proj_shard_size, - self.kv_proj_shard_size) + self.output_slices = ( + self.q_proj_shard_size, + self.kv_proj_shard_size, + self.kv_proj_shard_size, + ) self.packed_indices: Optional[torch.Tensor] = None 
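A small, self-contained illustration of the q/k/v column ranges sliced out of a fused lora_b in set_lora above for one tensor-parallel rank. The sizes are invented toy values, not shapes from this patch.

q_proj_total, kv_proj_total, tp_size = 8, 4, 2
q_shard = q_proj_total // tp_size
kv_shard = kv_proj_total // tp_size
q_shard_id, kv_shard_id = 1, 1                      # this rank's shard ids
q_cols = slice(q_shard * q_shard_id, q_shard * (q_shard_id + 1))
k_cols = slice(q_proj_total + kv_shard * kv_shard_id,
               q_proj_total + kv_shard * (kv_shard_id + 1))
v_cols = slice(q_proj_total + kv_proj_total + kv_shard * kv_shard_id,
               q_proj_total + kv_proj_total + kv_shard * (kv_shard_id + 1))
assert (q_cols, k_cols, v_cols) == (slice(4, 8), slice(10, 12), slice(14, 16))
# lora_b[:, q_cols], lora_b[:, k_cols] and lora_b[:, v_cols] are then
# concatenated along dim=1, mirroring the slicing above.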
self.standard_indices: Optional[torch.Tensor] = None # lazily initialized. self.indices: torch.Tensor self.indices_len: List[int] - self.seq_length_tensor: torch.Tensor - self.b_seq_start_tensor: torch.Tensor - self.batch_mlen_stage_lst: List[int] - def reset_lora(self, index: int): self.lora_a_stacked[0][index] = 0 self.lora_b_stacked[0][index] = 0 @@ -862,15 +900,15 @@ def slice_lora_b( if lora_b[0] is not None: lora_b_q = lora_b[0][:, self.q_proj_shard_size * self.q_shard_id:self.q_proj_shard_size * - (self.q_shard_id + 1)] + (self.q_shard_id + 1), ] if lora_b[1] is not None: lora_b_k = lora_b[1][:, self.kv_proj_shard_size * self.kv_shard_id:self.kv_proj_shard_size * - (self.kv_shard_id + 1)] + (self.kv_shard_id + 1), ] if lora_b[2] is not None: lora_b_v = lora_b[2][:, self.kv_proj_shard_size * self.kv_shard_id:self.kv_proj_shard_size * - (self.kv_shard_id + 1)] + (self.kv_shard_id + 1), ] lora_b = [lora_b_q, lora_b_k, lora_b_v] return lora_b @@ -923,10 +961,8 @@ def apply(self, x: torch.Tensor, x, self.lora_a_stacked, self.lora_b_stacked, - self.b_seq_start_tensor, - self.seq_length_tensor, self.indices, - self.batch_mlen_stage_lst, + self.indices_len, output, self.output_slices, ) @@ -935,11 +971,15 @@ def apply(self, x: torch.Tensor, @classmethod @_not_fully_sharded_can_replace - def can_replace_layer(cls, source_layer: nn.Module, - lora_config: LoRAConfig, packed_modules_list: List, - model_config: Optional[PretrainedConfig]) -> bool: - return type(source_layer) is QKVParallelLinear and len( - packed_modules_list) == 3 + def can_replace_layer( + cls, + source_layer: nn.Module, + lora_config: LoRAConfig, + packed_modules_list: List, + model_config: Optional[PretrainedConfig], + ) -> bool: + return (type(source_layer) is QKVParallelLinear + and len(packed_modules_list) == 3) class RowParallelLinearWithLoRA(BaseLayerWithLoRA): @@ -952,10 +992,11 @@ def __init__(self, base_layer: RowParallelLinear) -> None: self.device = _get_lora_device(self.base_layer) def create_lora_weights( - self, - max_loras: int, - lora_config: LoRAConfig, - model_config: Optional[PretrainedConfig] = None) -> None: + self, + max_loras: int, + lora_config: LoRAConfig, + model_config: Optional[PretrainedConfig] = None, + ) -> None: self.lora_config = lora_config self.tp_rank = get_tensor_model_parallel_rank() self.lora_a_stacked = torch.zeros( @@ -987,11 +1028,6 @@ def create_lora_weights( self.indices: torch.Tensor self.indices_len: List[int] - self.seq_length_tensor: torch.Tensor - self.b_seq_start_tensor: torch.Tensor - self.lora_index_tensor: torch.Tensor - self.batch_mlen_stage_lst: List[int] - def reset_lora(self, index: int): self.lora_a_stacked[index] = 0 self.lora_b_stacked[index] = 0 @@ -1028,23 +1064,28 @@ def set_lora( lora_b.T, non_blocking=True) def set_mapping( - self, base_indices: torch.Tensor, sampler_indices: torch.Tensor, - sampler_indices_padded: torch.Tensor, - embeddings_indices: torch.Tensor, long_lora_indices: torch.Tensor, - indices_len: List[int], seq_length_tensor: torch.Tensor, - b_seq_start_tensor: torch.Tensor, batch_mlen_stage_lst: List[int]): + self, + base_indices: torch.Tensor, + sampler_indices: torch.Tensor, + sampler_indices_padded: torch.Tensor, + embeddings_indices: torch.Tensor, + long_lora_indices: torch.Tensor, + indices_len: List[int], + ): self.indices = base_indices self.indices_len = indices_len - self.seq_length_tensor = seq_length_tensor - self.b_seq_start_tensor = b_seq_start_tensor - self.batch_mlen_stage_lst = batch_mlen_stage_lst def apply(self, x: torch.Tensor) 
-> torch.Tensor: output = self.base_layer.quant_method.apply(self.base_layer, x) # maybe we need not restrict range to [:batch_size] - _apply_lora_triton(x, self.lora_a_stacked, self.lora_b_stacked, - self.b_seq_start_tensor, self.seq_length_tensor, - self.indices, self.batch_mlen_stage_lst, output) + _apply_lora_triton( + x, + self.lora_a_stacked, + self.lora_b_stacked, + self.indices, + self.indices_len, + output, + ) return output def forward(self, input_): @@ -1087,14 +1128,18 @@ def forward(self, input_): @property def weight(self): - return self.base_layer.weight if hasattr( - self.base_layer, "weight") else self.base_layer.qweight + return (self.base_layer.weight if hasattr(self.base_layer, "weight") + else self.base_layer.qweight) @classmethod @_not_fully_sharded_can_replace - def can_replace_layer(cls, source_layer: nn.Module, - lora_config: LoRAConfig, packed_modules_list: List, - model_config: Optional[PretrainedConfig]) -> bool: + def can_replace_layer( + cls, + source_layer: nn.Module, + lora_config: LoRAConfig, + packed_modules_list: List, + model_config: Optional[PretrainedConfig], + ) -> bool: return type(source_layer) is RowParallelLinear @@ -1177,11 +1222,6 @@ def create_lora_weights( self.indices_len: List[int] self.indices_padded: torch.Tensor - self.seq_length_tensor: torch.Tensor - self.b_seq_start_tensor: torch.Tensor - self.lora_index_tensor: torch.Tensor - self.batch_mlen_stage_lst: List[int] - def reset_lora(self, index: int): self.lora_a_stacked[index] = 0 self.lora_b_stacked[index] = 0 @@ -1207,17 +1247,17 @@ def set_lora( shape[1], ] = embeddings_tensor def set_mapping( - self, base_indices: torch.Tensor, sampler_indices: torch.Tensor, - sampler_indices_padded: torch.Tensor, - embeddings_indices: torch.Tensor, long_lora_indices: torch.Tensor, - indices_len: List[int], seq_length_tensor: torch.Tensor, - b_seq_start_tensor: torch.Tensor, batch_mlen_stage_lst: List[int]): + self, + base_indices: torch.Tensor, + sampler_indices: torch.Tensor, + sampler_indices_padded: torch.Tensor, + embeddings_indices: torch.Tensor, + long_lora_indices: torch.Tensor, + indices_len: List[int], + ): self.indices = sampler_indices self.indices_padded = sampler_indices_padded self.indices_len = indices_len - self.seq_length_tensor = seq_length_tensor - self.b_seq_start_tensor = b_seq_start_tensor - self.batch_mlen_stage_lst = batch_mlen_stage_lst def _get_logits( self, @@ -1255,16 +1295,22 @@ def _get_logits( neginf=float("-inf"))) logits[:, self.base_layer.org_vocab_size:self.base_layer.org_vocab_size + - lora_logits.shape[1]] = lora_logits + lora_logits.shape[1], ] = lora_logits - batch_mlen_stage_lst = self.batch_mlen_stage_lst.copy() # LogitsProcessorWithLoRA always using bgmv - batch_mlen_stage_lst[2] = False - _apply_lora_triton(hidden_states, self.lora_a_stacked, - self.lora_b_stacked, self.b_seq_start_tensor, - self.seq_length_tensor, - self.indices[:self.indices_len[1]], - batch_mlen_stage_lst, logits) + # sampler_indices + sampler_indices = self.indices_len[1] + is_prefilling = False + add_lora_triton( + logits, + hidden_states, + self.lora_a_stacked, + self.lora_b_stacked, + self.indices[:sampler_indices], + 0, + 1.0, + is_prefilling, + ) # Remove paddings in vocab (if any). 
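The logits path above always takes the decoding (bgmv) branch because the sampler sees exactly one hidden-state row per sequence. A toy sketch of the shapes involved; all sizes, the stacked-weight layout, and the keyword spelling of the final flag are illustrative assumptions, and the actual call needs CUDA plus the Triton kernels.

import torch

num_seqs, hidden_size, lora_vocab, rank, max_loras = 3, 16, 32, 4, 2
hidden_states = torch.randn(num_seqs, hidden_size)       # one row per sequence
logits = torch.zeros(num_seqs, lora_vocab)
lora_a = torch.zeros(max_loras, 1, rank, hidden_size)     # assumed stacked layout
lora_b = torch.zeros(max_loras, 1, lora_vocab, rank)
sampler_indices = torch.tensor([0, 1, -1])                # LoRA slot per sequence
# The call above is, in spirit:
#   add_lora_triton(logits, hidden_states, lora_a, lora_b,
#                   sampler_indices, 0, 1.0, is_prefilling=False)
# i.e. always the per-row bgmv path, never the segment-based sgmv path.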
logits = logits[:, :self.base_layer.vocab_size] @@ -1274,9 +1320,13 @@ def forward(self, *args, **kwargs): return type(self.base_layer).forward(self, *args, **kwargs) @classmethod - def can_replace_layer(cls, source_layer: nn.Module, - lora_config: LoRAConfig, packed_modules_list: List, - model_config: Optional[PretrainedConfig]) -> bool: + def can_replace_layer( + cls, + source_layer: nn.Module, + lora_config: LoRAConfig, + packed_modules_list: List, + model_config: Optional[PretrainedConfig], + ) -> bool: # Special handling for the LogitsProcessor. return False @@ -1310,9 +1360,8 @@ def create_lora_weights( lora_config: LoRAConfig, model_config: Optional[PretrainedConfig] = None, ) -> None: - scaling_factors = list( - lora_config.long_lora_scaling_factors - ) if lora_config.long_lora_scaling_factors else [] + scaling_factors = (list(lora_config.long_lora_scaling_factors) + if lora_config.long_lora_scaling_factors else []) base_scaling_factor = (self.base_layer.scaling_factor if isinstance( self.base_layer, LinearScalingRotaryEmbedding) else 1.0) scaling_factors = sorted( @@ -1340,11 +1389,14 @@ def set_lora( ... def set_mapping( - self, base_indices: torch.Tensor, sampler_indices: torch.Tensor, - sampler_indices_padded: torch.Tensor, - embeddings_indices: torch.Tensor, long_lora_indices: torch.Tensor, - indices_len: List[int], seq_length_tensor: torch.Tensor, - b_seq_start_tensor: torch.Tensor, batch_mlen_stage_lst: List[int]): + self, + base_indices: torch.Tensor, + sampler_indices: torch.Tensor, + sampler_indices_padded: torch.Tensor, + embeddings_indices: torch.Tensor, + long_lora_indices: torch.Tensor, + indices_len: List[int], + ): self.long_lora_indices = long_lora_indices self.indices_len = indices_len @@ -1358,19 +1410,24 @@ def forward( positions, query, key, - offsets=self.long_lora_indices[:self.indices_len[4]]) + offsets=self.long_lora_indices[:self.indices_len[4]], + ) @property def scaling_factor_to_offset(self) -> Dict[float, int]: return self.base_layer.scaling_factor_to_offset @classmethod - def can_replace_layer(cls, source_layer: nn.Module, - lora_config: LoRAConfig, packed_modules_list: List, - model_config: Optional[PretrainedConfig]) -> bool: + def can_replace_layer( + cls, + source_layer: nn.Module, + lora_config: LoRAConfig, + packed_modules_list: List, + model_config: Optional[PretrainedConfig], + ) -> bool: """Returns True if the layer can be replaced by this LoRA layer.""" - return type(source_layer) is LinearScalingRotaryEmbedding or type( - source_layer) is RotaryEmbedding + return (type(source_layer) is LinearScalingRotaryEmbedding + or type(source_layer) is RotaryEmbedding) def extra_repr(self) -> str: return self.base_layer.extra_repr() diff --git a/vllm/lora/models.py b/vllm/lora/models.py index b6c47e599e81..f817bf65ec96 100644 --- a/vllm/lora/models.py +++ b/vllm/lora/models.py @@ -12,19 +12,21 @@ from vllm.config import LoRAConfig from vllm.logger import init_logger -from vllm.lora.layers import (BaseLayerWithLoRA, - LinearScalingRotaryEmbeddingWithLora, - LoRAMapping) +from vllm.lora.layers import ( + BaseLayerWithLoRA, + LinearScalingRotaryEmbeddingWithLora, + LoRAMapping, +) from vllm.lora.lora import LoRALayerWeights, PackedLoRALayerWeights -from vllm.lora.utils import (from_layer, from_layer_logits_processor, - parse_fine_tuned_lora_name, replace_submodule) +from vllm.lora.utils import ( + from_layer, + from_layer_logits_processor, + parse_fine_tuned_lora_name, + replace_submodule, +) from vllm.utils import LRUCache, is_pin_memory_available -# 
NOTE: The number of _MAX_BATCHS derived from worker's model_runner. -# _BATCH_SIZES_TO_CAPTURE.It needs to be updated if _BATCH_SIZES_TO_CAPTURE -# is changed. - -_MAX_BATCHS = 256 + 16 #max(_BATCH_SIZES_TO_CAPTURE)+16 +from vllm.lora import punica logger = init_logger(__name__) @@ -34,6 +36,7 @@ @dataclass class LongContextLoRAContext: """Context for lora adapters that support long context.""" + # The scaling factors to support long context lora fine tuned models. scaling_factors: List[float] # dimension to apply rotary embedding. @@ -51,7 +54,7 @@ def convert_mapping( extra_vocab_size: int, long_lora_context: Optional[LongContextLoRAContext] = None, ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, - Optional[torch.Tensor], List[int]]: + Optional[torch.Tensor], List[int], ]: """Converts LoRAMapping to index tensors. Args: @@ -89,7 +92,7 @@ def convert_mapping( """ index_mapping_indices: List[int] = list(mapping.index_mapping).copy() embedding_indices = index_mapping_indices.copy() - lora_indices = mapping.batch_mapping.copy() + lora_indices = index_mapping_indices.copy() long_lora_offsets: Optional[torch.Tensor] = None if long_lora_context: long_lora_offsets = torch.zeros(len(index_mapping_indices), @@ -99,27 +102,24 @@ def convert_mapping( lora_index_to_id.index(x) if x > 0 else -1 for x in mapping.prompt_mapping ] - token_lora_idx = None + lora_idx = None for i in range(len(index_mapping_indices)): # TODO index can be slow. optimize - token_lora_idx = (lora_index_to_id.index(index_mapping_indices[i]) - if index_mapping_indices[i] > 0 else -1) - embedding_indices[ - i] = token_lora_idx if index_mapping_indices[i] > 0 else 0 + lora_idx = (lora_index_to_id.index(index_mapping_indices[i]) + if index_mapping_indices[i] > 0 else -1) + embedding_indices[i] = lora_idx if index_mapping_indices[i] > 0 else 0 + lora_indices[i] = lora_idx if long_lora_context: assert long_lora_offsets is not None lora_offset: int = long_lora_context.offsets_by_lora_id.get( index_mapping_indices[i], 0) long_lora_offsets[i] = lora_offset - # every seq lora_id - for i in range(len(lora_indices)): - lora_indices[i] = (lora_index_to_id.index(lora_indices[i]) - if lora_indices[i] > 0 else -1) indices_list: List[Union[List[int], torch.Tensor]] = [ - index_mapping_indices, embedding_indices + index_mapping_indices, + lora_indices, + embedding_indices, ] - base_indices = torch.tensor(lora_indices, dtype=torch.long, device="cuda") if long_lora_context: assert long_lora_offsets is not None indices_list.append(long_lora_offsets) @@ -128,33 +128,39 @@ def convert_mapping( device="cuda", dtype=torch.long) embeddings_indices = torch.stack([ - indices[1] * extra_vocab_size, - indices[1] * (vocab_size + extra_vocab_size) + indices[2] * extra_vocab_size, + indices[2] * (vocab_size + extra_vocab_size), ]) embeddings_indices[embeddings_indices == -1] = max_loras - 1 - + base_indices = indices[1] sampler_indices = prompt_mapping_tensor sampler_indices_padded = sampler_indices.clone() sampler_indices_padded[sampler_indices_padded == -1] = max_loras - 1 - sampler_indices_padded = ( - torch.arange( - 0, len(sampler_indices_padded), device="cuda", dtype=torch.long) + - (sampler_indices_padded * len(sampler_indices_padded))) + sampler_indices_padded = torch.arange( + 0, len(sampler_indices_padded), device="cuda", dtype=torch.long) + ( + sampler_indices_padded * len(sampler_indices_padded)) long_lora_indices = None long_lora_indices_len: Optional[int] = None if long_lora_context: - long_lora_indices = indices[2] + 
long_lora_indices = indices[3] long_lora_indices_len = long_lora_indices.shape[-1] # Contain length of indices tensors. Used to index into each tensor. indices_len = [ - base_indices.shape[-1], sampler_indices.shape[-1], - sampler_indices_padded.shape[-1], embeddings_indices.shape[-1] + base_indices.shape[-1], + sampler_indices.shape[-1], + sampler_indices_padded.shape[-1], + embeddings_indices.shape[-1], ] if long_lora_indices_len is not None: indices_len.append(long_lora_indices_len) - - return (base_indices, sampler_indices, sampler_indices_padded, - embeddings_indices, long_lora_indices, indices_len) + return ( + base_indices, + sampler_indices, + sampler_indices_padded, + embeddings_indices, + long_lora_indices, + indices_len, + ) def get_lora_id(): @@ -202,8 +208,8 @@ def clone(self, lora_model_id: int) -> "LoRAModel": @property def extra_vocab_size(self) -> int: - return max(lora.extra_vocab_size - for lora in self.loras.values()) if self.loras else 0 + return (max(lora.extra_vocab_size + for lora in self.loras.values()) if self.loras else 0) def get_lora(self, module_name: str) -> Optional[LoRALayerWeights]: """Get LoRA for a given module by name""" @@ -244,9 +250,14 @@ def from_lora_tensors( if pin_memory: lora_embeddings_tensor = ( lora_embeddings_tensor.pin_memory()) - loras[module_name] = LoRALayerWeights(module_name, rank, - lora_alpha, None, None, - lora_embeddings_tensor) + loras[module_name] = LoRALayerWeights( + module_name, + rank, + lora_alpha, + None, + None, + lora_embeddings_tensor, + ) if is_lora_a: loras[module_name].lora_a = tensor.to(device=device, dtype=dtype).t() @@ -257,9 +268,9 @@ def from_lora_tensors( loras[module_name].lora_b = tensor.to(device=device, dtype=dtype).t() assert embedding_padding_modules is not None - if any(name in module_name - for name in embedding_padding_modules - ) and target_embedding_padding is not None: + if (any(name in module_name + for name in embedding_padding_modules) + and target_embedding_padding is not None): lora_b = loras[module_name].lora_b assert target_embedding_padding >= lora_b.shape[1] addition = target_embedding_padding - lora_b.shape[1] @@ -288,7 +299,7 @@ def from_local_checkpoint( embedding_padding_modules: Optional[List[str]] = None, ) -> "LoRAModel": """Create a LoRAModel from a local checkpoint. - + Args: lora_dir: The local path that has lora data. expected_lora_modules: Name of modules that are expected to be @@ -411,17 +422,16 @@ def __init__( self.max_num_batched_tokens, dtype=torch.long, device="cuda") - self.long_lora_indices = torch.empty(self.max_num_batched_tokens, dtype=torch.long, device="cuda") # Scaling factor -> offset to the sin_cos_cache to it. # Used for long context lora. self.scaling_factor_to_offset: Dict[float, int] = {} - # 4 is the number of indicies tensors defined above + # 5 is the number of indicies tensors. # base_indices, sampler_indices, sampler_indices_padded, - # embeddings_indices - self.indices_len: List[Optional[int]] = [None] * 4 + # embeddings_indices,prefilling or decoding + self.indices_len: List[Optional[int]] = [None] * 5 self.model: nn.Module = model if hasattr(self.model, "supported_lora_modules"): @@ -439,21 +449,7 @@ def __init__( # Dict instead of a Set for compatibility with LRUCache. 
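A worked toy example of the per-token conversion performed by convert_mapping above; the ids and slot assignments are invented. lora_index_to_id maps slot position to LoRA id, and tokens mapped to id 0 mean "no LoRA".

lora_index_to_id = [7, 9]                 # slot 0 holds LoRA id 7, slot 1 holds id 9
index_mapping = [7, 7, 9, 0]              # one LoRA id per input token, 0 = no LoRA
lora_indices = [lora_index_to_id.index(t) if t > 0 else -1 for t in index_mapping]
embedding_indices = [lora_index_to_id.index(t) if t > 0 else 0 for t in index_mapping]
assert lora_indices == [0, 0, 1, -1]      # becomes base_indices
assert embedding_indices == [0, 0, 1, 0]  # scaled into the two embeddings_indices rows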
self._active_loras: Dict[int, None] = {} self._last_mapping: Optional[LoRAMapping] = None - - # triton kernel mapping - self.seq_length_tensor = torch.empty(_MAX_BATCHS, - dtype=torch.long, - device="cuda") - self.b_seq_start_tensor = torch.zeros(_MAX_BATCHS, - dtype=torch.long, - device="cuda") - - # element contains batch_size, max_length, 0 or 1. Use 1 for the - # prefilling stage and 0 for the decoding stage.The reason for - # distinguishing between the prefilling and decoding stage is that - # we had implemented bgmv, it can be utilized during the decoding - # stage. - self.batch_mlen_stage_lst = [-1] * 3 + self._convert_flag = True self._create_lora_modules() self.model.lora_manager = self @@ -477,7 +473,9 @@ def activate_lora( return False first_free_slot = next( ((i, lora_id) for i, lora_id in enumerate(self.lora_index_to_id) - if lora_id is None), None) + if lora_id is None), + None, + ) if first_free_slot is None: raise ValueError("No free lora slots") index, _ = first_free_slot @@ -490,8 +488,12 @@ def activate_lora( module_lora = lora_model.get_lora(module_name) if module_lora: module_lora.optimize() - module.set_lora(index, module_lora.lora_a, module_lora.lora_b, - module_lora.embeddings_tensor) + module.set_lora( + index, + module_lora.lora_a, + module_lora.lora_b, + module_lora.embeddings_tensor, + ) else: module.reset_lora(index) return True @@ -518,7 +520,7 @@ def _set_long_lora_context(self, lora: LoRAModel): if lora.scaling_factor is None: return - if (lora.scaling_factor not in self.scaling_factor_to_offset): + if lora.scaling_factor not in self.scaling_factor_to_offset: raise ValueError(f"Long LoRA scaling factor {lora.scaling_factor}" " has not been initialized.") @@ -536,7 +538,11 @@ def add_lora(self, lora: LoRAModel) -> bool: logger.debug( "Adding lora. 
Model id: %d, " "int id: %d, " - "scaling factor: %s", lora.id, lora.id, lora.scaling_factor) + "scaling factor: %s", + lora.id, + lora.id, + lora.scaling_factor, + ) if lora.id not in self._registered_loras: if len(self._registered_loras) >= self.capacity: raise RuntimeError("No free LoRA slots.") @@ -554,12 +560,21 @@ def remove_lora(self, lora_id: int) -> bool: # TODO see if this can be vectorized def _set_lora_mapping(self, mapping: LoRAMapping) -> None: - (base_indices, sampler_indices, sampler_indices_padded, - embeddings_indices, long_lora_offsets_tensor, - indices_len) = convert_mapping(mapping, self.lora_index_to_id, - self.lora_slots + 1, self.vocab_size, - self.lora_config.lora_extra_vocab_size, - self.long_lora_context) + ( + base_indices, + sampler_indices, + sampler_indices_padded, + embeddings_indices, + long_lora_offsets_tensor, + indices_len, + ) = convert_mapping( + mapping, + self.lora_index_to_id, + self.lora_slots + 1, + self.vocab_size, + self.lora_config.lora_extra_vocab_size, + self.long_lora_context, + ) self.base_indices[:base_indices.shape[0]].copy_(base_indices) self.sampler_indices[:sampler_indices.shape[0]].copy_(sampler_indices) self.sampler_indices_padded[:sampler_indices_padded.shape[0]].copy_( @@ -573,25 +588,11 @@ def _set_lora_mapping(self, mapping: LoRAMapping) -> None: else: self.long_lora_indices.zero_() # Maintain the reference - self.indices_len[:] = indices_len - - # Mapping for sgmv kernel - if mapping.seq_lens and mapping.batch_mapping: - batchs = len(mapping.seq_lens) - seq_length_tensor = torch.tensor(mapping.seq_lens, - dtype=torch.long, - device="cuda") - self.seq_length_tensor[:batchs].copy_(seq_length_tensor) - temp_tensor = torch.cumsum(seq_length_tensor, - dim=0, - dtype=seq_length_tensor.dtype) - self.b_seq_start_tensor[1:temp_tensor.size(0) + - 1].copy_(temp_tensor) - - self.batch_mlen_stage_lst[:] = [ - batchs, - max(mapping.seq_lens), 1 if mapping.is_prefilling else 0 - ] + self.indices_len[:] = indices_len + [int(mapping.is_prefilling)] + # + if mapping.is_prefilling: + punica.reset_params_cache() + punica._compute_params(self.base_indices[:base_indices.shape[0]]) def set_lora_mapping(self, lora_mapping: LoRAMapping) -> None: if self._last_mapping != lora_mapping: @@ -619,52 +620,66 @@ def _create_lora_modules(self): parts = module_name.split(".")[-1] packed_moduled_lst = self.packed_modules_mapping.get(parts, []) new_module = replace_submodule( - self.model, module_name, - from_layer(module, self.lora_slots, self.lora_config, - packed_moduled_lst, self.model.config)) + self.model, + module_name, + from_layer( + module, + self.lora_slots, + self.lora_config, + packed_moduled_lst, + self.model.config, + ), + ) # LinearScalingRotaryEmbeddingWithLora is used to handle # long context lora. Register relevant metadata. 
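A minimal illustration of how indices_len is laid out after _set_lora_mapping above appends the prefill flag, and of why the in-place slice assignment matters: every wrapped layer keeps a reference to the same list via set_mapping, so it observes the update without any further call. The concrete lengths are invented.

indices_len = [None] * 5                     # shared with every LoRA layer via set_mapping
layer_view = indices_len                     # layers store the same list object
new_lengths = [10, 4, 4, 10]                 # base, sampler, sampler_padded, embeddings
# (a further entry for long_lora_indices is present only with long-context LoRA)
is_prefilling = True
indices_len[:] = new_lengths + [int(is_prefilling)]
assert layer_view == [10, 4, 4, 10, 1]       # last element: 1 = prefill, 0 = decode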
if isinstance(new_module, LinearScalingRotaryEmbeddingWithLora): self.long_lora_context = LongContextLoRAContext( new_module.scaling_factors, new_module.rotary_dim) - self.scaling_factor_to_offset = \ - new_module.scaling_factor_to_offset + self.scaling_factor_to_offset = ( + new_module.scaling_factor_to_offset) # (yard1): TODO make this more robust if "lm_head" in module_name: logits_processor_module = self.model.get_submodule( "logits_processor") new_module = replace_submodule( - self.model, "logits_processor", - from_layer_logits_processor(logits_processor_module, - module, self.lora_slots, - self.lora_config, - self.model.config)) + self.model, + "logits_processor", + from_layer_logits_processor( + logits_processor_module, + module, + self.lora_slots, + self.lora_config, + self.model.config, + ), + ) self.register_module(module_name, new_module) self._register_packed_modules(module_name) - new_module.set_mapping(self.base_indices, self.sampler_indices, - self.sampler_indices_padded, - self.embeddings_indices, - self.long_lora_indices, self.indices_len, - self.seq_length_tensor, - self.b_seq_start_tensor, - self.batch_mlen_stage_lst) + new_module.set_mapping( + self.base_indices, + self.sampler_indices, + self.sampler_indices_padded, + self.embeddings_indices, + self.long_lora_indices, + self.indices_len, + ) def register_module(self, module_name: str, module: "BaseLayerWithLoRA"): assert isinstance(module, BaseLayerWithLoRA) self.modules[module_name] = module def create_dummy_lora( - self, - lora_id: int, - rank: int, - scaling_factor: Optional[float], - embedding_modules: Optional[Dict[str, str]] = None) -> LoRAModel: + self, + lora_id: int, + rank: int, + scaling_factor: Optional[float], + embedding_modules: Optional[Dict[str, str]] = None, + ) -> LoRAModel: """Create zero-initialized LoRAModel for warmup.""" model = LoRAModel(lora_id, rank, {}, scaling_factor) for module_name, module in self.model.named_modules(): - if not self._match_target_modules(module_name) or not isinstance( - module, BaseLayerWithLoRA) or isinstance( - module, LinearScalingRotaryEmbeddingWithLora): + if (not self._match_target_modules(module_name) + or not isinstance(module, BaseLayerWithLoRA) or isinstance( + module, LinearScalingRotaryEmbeddingWithLora)): continue parts = module_name.split(".") if module_name not in self.packed_modules: @@ -674,9 +689,9 @@ def create_dummy_lora( self.lora_config.lora_extra_vocab_size if hasattr(module.base_layer, "org_vocab_size") else module.base_layer.weight.shape[1]) - output_dim = module.base_layer.embedding_dim if hasattr( - module.base_layer, - "embedding_dim") else module.base_layer.weight.shape[0] + output_dim = (module.base_layer.embedding_dim if hasattr( + module.base_layer, "embedding_dim") else + module.base_layer.weight.shape[0]) embeddings_tensor_dim = (module.base_layer.embedding_dim if hasattr(module.base_layer, "embedding_dim") else @@ -688,7 +703,8 @@ def create_dummy_lora( rank, module.lora_a_stacked.dtype, "cpu", - embeddings_tensor_dim=embeddings_tensor_dim) + embeddings_tensor_dim=embeddings_tensor_dim, + ) else: lora = LoRALayerWeights.create_dummy_lora_weights( module_name, @@ -722,7 +738,8 @@ def _match_target_modules(self, module_name: str): return any( re.match( r".*\.{target_module}$".format(target_module=target_module), - module_name) or target_module == module_name + module_name, + ) or target_module == module_name for target_module in self.supported_lora_modules) def _register_packed_modules(self, module_full_name: str) -> None: @@ -797,7 
+814,11 @@ def add_lora(self, lora: LoRAModel) -> bool: logger.debug( "Adding lora. Model id: %d, " "int id: %d, " - "scaling factor: %s", lora.id, lora.id, lora.scaling_factor) + "scaling factor: %s", + lora.id, + lora.id, + lora.scaling_factor, + ) if lora.id not in self._registered_loras: self._add_lora(lora) was_added = True @@ -811,8 +832,8 @@ def activate_lora( self, lora_id: int, ) -> bool: - if lora_id not in self._active_loras and len( - self._active_loras) >= self.lora_slots: + if (lora_id not in self._active_loras + and len(self._active_loras) >= self.lora_slots): self._active_loras.remove_oldest() result = super().activate_lora(lora_id) # We always touch to update the LRU cache order @@ -827,13 +848,14 @@ def remove_oldest_lora(self) -> bool: def create_lora_manager( - model: nn.Module, - max_num_seqs: int, - max_num_batched_tokens: int, - vocab_size: int, - lora_config: LoRAConfig, - lora_manager_cls: Type[LoRAModelManager] = LoRAModelManager, - **kwargs) -> LoRAModelManager: + model: nn.Module, + max_num_seqs: int, + max_num_batched_tokens: int, + vocab_size: int, + lora_config: LoRAConfig, + lora_manager_cls: Type[LoRAModelManager] = LoRAModelManager, + **kwargs, +) -> LoRAModelManager: """Create a LoRA adapter for a given model.""" if not hasattr(model, "supported_lora_modules"): raise ValueError(f"Model {type(model)} is not supported for LoRA.") @@ -843,5 +865,6 @@ def create_lora_manager( max_num_batched_tokens=max_num_batched_tokens, vocab_size=vocab_size, lora_config=lora_config, - **kwargs) + **kwargs, + ) return lora_manager diff --git a/vllm/lora/punica.py b/vllm/lora/punica.py index ba387fc2010f..7366edf81491 100644 --- a/vllm/lora/punica.py +++ b/vllm/lora/punica.py @@ -1,9 +1,7 @@ # Based on code from https://github.com/punica-ai/punica -from typing import Optional - +from typing import Optional, Dict, Tuple import torch - from vllm.lora.ops.bgmv_expand import bgmv_expand from vllm.lora.ops.bgmv_expand_slice import bgmv_expand_slice from vllm.lora.ops.bgmv_shrink import bgmv_shrink @@ -15,12 +13,53 @@ def _raise_import_error(e): if torch.cuda.get_device_capability() < (8, 0): raise ImportError( - "punica LoRA kernels require compute capability >= 8.0") from e + "punica LoRA kernels require compute capability >= 8.0" + ) from e else: raise ImportError( "punica LoRA kernels could not be imported. If you built vLLM " "from source, make sure VLLM_INSTALL_PUNICA_KERNELS=1 env var " - "was set.") from e + "was set." 
+ ) from e + + +_PARAMS_CACHE: Dict[int, Tuple] = {} + + +def _compute_params(token_lora_tensor: torch.Tensor): + pointer = token_lora_tensor.data_ptr() + if pointer not in _PARAMS_CACHE: + lora_indices_tensor, seq_length_tensor = torch.unique_consecutive( + token_lora_tensor, return_counts=True + ) + cum_result = torch.cumsum(seq_length_tensor, dim=0) + b_seq_start_tensor = torch.zeros_like(seq_length_tensor) + b_seq_start_tensor[1:].copy_(cum_result[:-1]) + max_length = seq_length_tensor.max().item() + batch_size = lora_indices_tensor.size(0) + _PARAMS_CACHE[pointer] = ( + b_seq_start_tensor, + seq_length_tensor, + lora_indices_tensor, + batch_size, + max_length, + ) + return _PARAMS_CACHE[pointer] + + +def reset_params_cache(): + """At the beginning of the prefilling stage, we need clear the + cache explicitly + """ + _PARAMS_CACHE.clear() + + +def _get_prefilling_params( + token_lora_tensor: torch.Tensor, cache_clear: bool = False +): + if cache_clear: + reset_params_cache() + return _compute_params(token_lora_tensor) def bgmv( @@ -147,12 +186,13 @@ def add_lora( # We set the buffer to be float32 by default to avoid # numerical inaccuracies that would otherwise happen # due to downcasting. - buffer = torch.zeros((x.size(0), r), - dtype=torch.float32, - device=x.device) + buffer = torch.zeros( + (x.size(0), r), dtype=torch.float32, device=x.device + ) punica_kernels.dispatch_bgmv(buffer, x, wa_t_all, indicies, layer_idx, 1.0) - punica_kernels.dispatch_bgmv(y, buffer, wb_t_all, indicies, layer_idx, - scale) + punica_kernels.dispatch_bgmv( + y, buffer, wb_t_all, indicies, layer_idx, scale + ) def add_lora_slice( @@ -200,12 +240,11 @@ def add_lora_slice( r = wb_t_all.size(-1) if buffer is None: - # We set the buffer to be float32 by default to avoid - # numerical inaccuracies that would otherwise happen - # due to downcasting. - buffer = torch.zeros((x.size(0), r), - dtype=torch.float32, - device=x.device) + # We set the buffer to be float32 by default ,refer to: + # https://github.com/triton-lang/triton/issues/1387 + buffer = torch.zeros( + (x.size(0), r), dtype=torch.float32, device=x.device + ) punica_kernels.dispatch_bgmv_low_level( buffer, x, @@ -230,269 +269,175 @@ def add_lora_slice( ) -def add_lora_triton( +def add_shrink_triton( y: torch.Tensor, x: torch.Tensor, - wa_t_all: torch.Tensor, - wb_t_all: torch.Tensor, - b_seq_start_tensor: torch.Tensor, - seq_length_tensor: torch.Tensor, + w_t_all: torch.Tensor, lora_indices_tensor: torch.Tensor, - batch_size: int, - max_length: int, layer_idx: int, scale: float, is_prefilling: bool, - *, - buffer: Optional[torch.Tensor] = None, + cache_clear: bool = False, ): - """ - Semantics: - y[i] += ( - x[i].unsqueeze(0) - @ wa_t_all[lora_index_tensor[i], layer_idx, :, :].transpose(-1, -2) - @ wb_t_all[lora_index_tensor[i], layer_idx, :, :].transpose(-1, -2) - * scale - ).squeeze(0) - Args: - y (torch.Tensor): (batch_size, output_dim).Will be changed in-place. - x (torch.Tensor): (batch_size, hidden_dim) - wa_t_all (torch.Tensor): (num_loras, lora_rank, hidden_dim) - wb_t_all (torch.Tensor): (num_loras, output_dim, lora_rank) - b_seq_start_tensor (torch.Tensor): (batch_size,). The cumulative - sequence lengths of the sequences in the batch, used to index - into sequence. E.g.,if the sequence length is [4, 6], it is - [0, 4]. Used only during the prefilling stage. - seq_length_tensor (torch.Tensor): batch_size,). record the sequence - length of the sequences in the batch. Used only during the - prefilling stage. 
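A worked example (toy values, CPU tensors) of the decomposition that _compute_params above derives from the per-token LoRA index tensor during prefill. Note that unique_consecutive yields one segment per run of identical indices, so adjacent sequences that happen to share an adapter are fused into a single segment.

import torch

token_lora = torch.tensor([0, 0, 0, 1, 1, -1])               # 3 sequences: lengths 3, 2, 1
lora_ids, seq_lens = torch.unique_consecutive(token_lora, return_counts=True)
b_seq_start = torch.zeros_like(seq_lens)
b_seq_start[1:] = torch.cumsum(seq_lens, dim=0)[:-1]
assert lora_ids.tolist() == [0, 1, -1]                        # LoRA slot per segment
assert seq_lens.tolist() == [3, 2, 1]                         # tokens per segment
assert b_seq_start.tolist() == [0, 3, 5]                      # segment start offsets
assert (lora_ids.size(0), int(seq_lens.max())) == (3, 3)      # batch_size, max_length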
- lora_index_tensor (torch.Tensor): (batch_size,). The LoRA index - corresponding to each batch - batch_size (int): batch size. Used only during the prefilling stage. - max_length (int): maximum seq length in the batch.Used only during the - prefilling stage. - layer_idx (int): Layer index of LoRA weights. - scale (float): Scaling factor. - is_prefilling (bool): True indicates the prefilling stage, while False - indicates the decoding stage." - buffer (Optional[torch.Tensor], optional): (batch_size,rank) - """ - r = wb_t_all.size(-1) - if buffer is None: - # We set the buffer to be float32 by default ,refer to: - # https://github.com/triton-lang/triton/issues/1387 - - buffer = torch.zeros((x.size(0), r), - dtype=torch.float32, - device=x.device) if is_prefilling: - _lora_sgmv( - y, + ( + b_seq_start_tensor, + seq_length_tensor, + last_lora_indices_tensor, + batch_size, + max_length, + ) = _get_prefilling_params(lora_indices_tensor, cache_clear) + sgmv_shrink( x, - wa_t_all, - wb_t_all, + w_t_all, + y, b_seq_start_tensor, seq_length_tensor, - lora_indices_tensor, + last_lora_indices_tensor, batch_size, max_length, - layer_idx, scale, - buffer=buffer, ) else: - _lora_bgmv( - y, - x, - wa_t_all, - wb_t_all, - lora_indices_tensor, - layer_idx, - scale, - buffer=buffer, - ) + bgmv_shrink(x, w_t_all, y, lora_indices_tensor, scale) -def _lora_sgmv( +def add_expand_triton( y: torch.Tensor, x: torch.Tensor, - wa_t_all: torch.Tensor, - wb_t_all: torch.Tensor, - b_seq_start_tensor: torch.Tensor, - seq_length_tensor: torch.Tensor, - lora_indices_tensor: torch.Tensor, - batch_size: int, - max_length: int, - layer_idx: int, - scale: float, - buffer: torch.Tensor, -): - sgmv_shrink( - x, - wa_t_all, - buffer, - b_seq_start_tensor, - seq_length_tensor, - lora_indices_tensor, - batch_size, - max_length, - scale, - ) - sgmv_expand( - buffer, - wb_t_all, - y, - b_seq_start_tensor, - seq_length_tensor, - lora_indices_tensor, - batch_size, - max_length, - add_inputs=True, - ) - - -def _lora_bgmv( - y: torch.Tensor, - x: torch.Tensor, - wa_t_all: torch.Tensor, - wb_t_all: torch.Tensor, + w_t_all: torch.Tensor, lora_indices_tensor: torch.Tensor, layer_idx: int, - scale: float, - buffer: torch.Tensor, + is_prefilling: bool, + add_input: bool = True, + cache_clear: bool = False, ): - bgmv_shrink(x, wa_t_all, buffer, lora_indices_tensor, scale) - bgmv_expand(buffer, wb_t_all, y, lora_indices_tensor, add_inputs=True) + if is_prefilling: + ( + b_seq_start_tensor, + seq_length_tensor, + last_lora_indices_tensor, + batch_size, + max_length, + ) = _get_prefilling_params(lora_indices_tensor, cache_clear) + sgmv_expand( + x, + w_t_all, + y, + b_seq_start_tensor, + seq_length_tensor, + last_lora_indices_tensor, + batch_size, + max_length, + add_input, + ) + else: + bgmv_expand(x, w_t_all, y, lora_indices_tensor, add_inputs=add_input) -def add_lora_triton_slice( +def add_expand_slice_triton( y: torch.Tensor, x: torch.Tensor, - wa_t_all: torch.Tensor, - wb_t_all: torch.Tensor, - b_seq_start_tensor: torch.Tensor, - seq_length_tensor: torch.Tensor, + w_t_all: torch.Tensor, lora_indices_tensor: torch.Tensor, - batch_size: int, - max_length: int, layer_idx: int, - scale: float, + is_prefilling: bool, y_offset: int, y_slice_size: int, - is_prefilling: bool, - *, - buffer: Optional[torch.Tensor] = None, + add_input: bool = True, + cache_clear: bool = False, ): - """ - Same as `add_lora_triton` but you can operate on slices of y. - Pass whole y, define y_offset and y_slice_size. 
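A tiny plain-PyTorch check of the property the prefill/decode dispatch above relies on: one GEMM per consecutive segment (the sgmv view used for prefill) matches one matmul per token (the bgmv view used for decode) whenever consecutive tokens share an adapter. Shapes and values are toy assumptions.

import torch

torch.manual_seed(0)
hidden, rank, max_loras = 16, 4, 2
x = torch.randn(5, hidden)
a = torch.randn(max_loras, rank, hidden)                      # per-slot LoRA-A
token_lora = torch.tensor([0, 0, 0, 1, 1])

per_token = torch.stack(
    [x[i] @ a[slot].t() for i, slot in enumerate(token_lora.tolist())])

ids, counts = torch.unique_consecutive(token_lora, return_counts=True)
starts = torch.zeros_like(counts)
starts[1:] = torch.cumsum(counts, dim=0)[:-1]
per_segment = torch.cat([
    x[s:s + n] @ a[slot].t()
    for slot, s, n in zip(ids.tolist(), starts.tolist(), counts.tolist())
])
assert torch.allclose(per_token, per_segment)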
- """ - # try: - # import vllm._punica_C as punica_kernels - # except ImportError as e: - # _raise_import_error(e) - - r = wb_t_all.size(-1) - if buffer is None: - # We set the buffer to be float32 by default to avoid - # numerical inaccuracies that would otherwise happen - # due to downcasting. - buffer = torch.zeros((x.size(0), r), - dtype=torch.float32, - device=x.device) if is_prefilling: - _lora_sgmv_nslice( - y, + ( + b_seq_start_tensor, + seq_length_tensor, + last_lora_indices_tensor, + batch_size, + max_length, + ) = _get_prefilling_params(lora_indices_tensor, cache_clear) + sgmv_expand_slice( x, - wa_t_all, - wb_t_all, + w_t_all, + y, b_seq_start_tensor, seq_length_tensor, - lora_indices_tensor, + last_lora_indices_tensor, batch_size, max_length, - layer_idx, - scale, y_offset, y_slice_size, - buffer, + add_input, ) else: - _lora_bgmv_nslice( - y, + bgmv_expand_slice( x, - wa_t_all, - wb_t_all, + w_t_all, + y, lora_indices_tensor, - layer_idx, - scale, y_offset, y_slice_size, - buffer, + add_inputs=add_input, ) -def _lora_sgmv_nslice( +def add_lora_triton( y: torch.Tensor, x: torch.Tensor, wa_t_all: torch.Tensor, wb_t_all: torch.Tensor, - b_seq_start_tensor: torch.Tensor, - seq_length_tensor: torch.Tensor, lora_indices_tensor: torch.Tensor, - batch_size: int, - max_length: int, layer_idx: int, scale: float, - y_offset: int, - y_slice_size: int, - buffer, + is_prefilling: bool, + y_offset: Optional[int] = None, + y_slice_size: Optional[int] = None, + *, + buffer: Optional[torch.Tensor] = None, + cache_clear: bool = False, ): - sgmv_shrink( + """ + Same as `add_lora_triton` but you can operate on slices of y. + Pass whole y, define y_offset and y_slice_size. + """ + r = wb_t_all.size(-1) + if buffer is None: + # We set the buffer to be float32 by default ,refer to: + # https://github.com/triton-lang/triton/issues/1387 + buffer = torch.zeros( + (x.size(0), r), dtype=torch.float32, device=x.device + ) + + add_shrink_triton( + buffer, x, wa_t_all, - buffer, - b_seq_start_tensor, - seq_length_tensor, lora_indices_tensor, - batch_size, - max_length, + 0, scale, + is_prefilling, + cache_clear=cache_clear, ) - sgmv_expand_slice( - buffer, - wb_t_all, - y, - b_seq_start_tensor, - seq_length_tensor, - lora_indices_tensor, - batch_size, - max_length, - y_offset, - y_slice_size, - add_inputs=True, - ) - - -def _lora_bgmv_nslice( - y: torch.Tensor, - x: torch.Tensor, - wa_t_all: torch.Tensor, - wb_t_all: torch.Tensor, - lora_indices_tensor: torch.Tensor, - layer_idx: int, - scale: float, - y_offset: int, - y_slice_size: int, - buffer, -): - bgmv_shrink(x, wa_t_all, buffer, lora_indices_tensor, scale) - bgmv_expand_slice( - buffer, - wb_t_all, - y, - lora_indices_tensor, - y_offset, - y_slice_size, - add_inputs=True, - ) + if y_offset is None and y_slice_size is None: + add_expand_triton( + y, + buffer, + wb_t_all, + lora_indices_tensor, + 0, + is_prefilling, + add_input=True, + cache_clear=cache_clear, + ) + else: + add_expand_slice_triton( + y, + buffer, + wb_t_all, + lora_indices_tensor, + 0, + is_prefilling, + y_offset, + y_slice_size, + add_input=True, + cache_clear=cache_clear, + ) diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 476e9ba3bb46..3a5bc6c78515 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -1,7 +1,5 @@ -import gc import time import warnings -from collections import defaultdict from typing import Dict, List, NamedTuple, Optional, Set, Tuple, Union import numpy as np @@ -13,17 +11,16 @@ ModelConfig, 
ParallelConfig, SchedulerConfig, VisionLanguageConfig) from vllm.distributed import broadcast_tensor_dict -from vllm.distributed.parallel_state import graph_capture +from vllm.distributed.communication_op import graph_capture from vllm.logger import init_logger from vllm.lora.layers import LoRAMapping from vllm.lora.request import LoRARequest from vllm.lora.worker_manager import LRUCacheWorkerLoRAManager from vllm.model_executor import SamplingMetadata from vllm.model_executor.model_loader import get_model -from vllm.model_executor.model_loader.tensorizer import TensorizerConfig -from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.sampling_params import SamplingParams -from vllm.sequence import SamplerOutput, SequenceData, SequenceGroupMetadata +from vllm.sequence import (MultiModalData, SamplerOutput, SequenceData, + SequenceGroupMetadata) from vllm.utils import (CudaMemoryProfiler, get_kv_cache_torch_dtype, is_hip, is_pin_memory_available, make_tensor_with_pad) @@ -37,7 +34,6 @@ _BATCH_SIZES_TO_CAPTURE = [1, 2, 4] + [ _BATCH_SIZE_ALIGNMENT * i for i in range(1, 33) ] -_NUM_WARMUP_ITERS = 2 class ModelInput(NamedTuple): @@ -48,7 +44,7 @@ class ModelInput(NamedTuple): query_lens: List[int] lora_mapping: Optional[LoRAMapping] lora_requests: Set[LoRARequest] - multi_modal_kwargs: Dict[str, torch.Tensor] + multi_modal_input: Optional[torch.Tensor] slot_mapping: torch.Tensor num_prefill_tokens: int num_decode_tokens: int @@ -64,7 +60,7 @@ def empty(cls, device): query_lens=[], lora_mapping=None, lora_requests=set(), - multi_modal_kwargs={}, + multi_modal_input=None, slot_mapping=torch.empty(0, device=device), num_prefill_tokens=0, num_decode_tokens=0, @@ -126,16 +122,6 @@ def __init__( self.block_size, ) - # Create processor for multi-modal data - if self.vision_language_config is not None: - self.multi_modal_input_processor = MULTIMODAL_REGISTRY \ - .create_input_processor( - self.model_config, - self.vision_language_config, - ) - else: - self.multi_modal_input_processor = None - # Lazy initialization self.model: nn.Module # Set after load_model # Set if the backend is flashinfer. @@ -223,16 +209,6 @@ def save_sharded_state( max_size=max_size, ) - def save_tensorized_model( - self, - tensorizer_config: TensorizerConfig, - ) -> None: - from vllm.model_executor.model_loader.loader import TensorizerLoader - TensorizerLoader.save_model( - self.model, - tensorizer_config=tensorizer_config, - ) - def get_max_block_per_batch(self) -> int: block_size = self.block_size return (self.max_seq_len_to_capture + block_size - 1) // block_size @@ -266,8 +242,7 @@ def _prepare_model_input( context_lens: List[int] = [] query_lens: List[int] = [] block_tables: List[List[int]] = [] - multi_modal_kwargs_list: Dict[str, - List[torch.Tensor]] = defaultdict(list) + multi_modal_input_list: List[torch.Tensor] = [] decode_only = True num_prefills = 0 num_prefill_tokens = 0 @@ -294,12 +269,6 @@ def _prepare_model_input( if len(seq_group_metadata_list) == 0: return ModelInput.empty(self.device) - if self.sliding_window is not None: - sliding_window_blocks = (self.sliding_window + self.block_size - - 1) // self.block_size - block_aligned_sliding_window = \ - sliding_window_blocks * self.block_size - for seq_group_metadata in seq_group_metadata_list: seq_ids = list(seq_group_metadata.seq_data.keys()) is_prompt = seq_group_metadata.is_prompt @@ -340,30 +309,6 @@ def _prepare_model_input( and self.sliding_window is None and is_prompt) - # These are seq_len/context_len capped to the sliding window. 
- # They are passed to decode kernel. - # We still need original seq_len/context_len to compute slot - # mapping (and input position) below. - curr_sliding_window_blocks = None - sliding_seq_len = seq_len - sliding_context_len = context_len - - # TODO(sang): This is a hack to make sliding window work with - # paged attn. We can remove it if we make paged attn kernel - # to properly handle slinding window attn. - if (self.sliding_window is not None and not is_prompt): - curr_sliding_window_blocks = sliding_window_blocks - if self.scheduler_config.use_v2_block_manager: - # number of elements in last block - suff_len = seq_len % self.block_size - sliding_seq_len = min( - seq_len, block_aligned_sliding_window + suff_len) - if suff_len > 0: - curr_sliding_window_blocks += 1 - else: - sliding_seq_len = min(seq_len, self.sliding_window) - sliding_context_len = sliding_seq_len - 1 - # TODO(sang): Combine chunked prefill and prefix caching by # only allowing multiple of block_size chunk size. # NOTE: This only works for oooooooxxx style attention. @@ -371,13 +316,6 @@ def _prepare_model_input( assert computed_block_nums is not None context_len = len(computed_block_nums) * self.block_size tokens = tokens[context_len:] - - # need to think what to set it to when we have both sliding - # window and prefix caching... - assert self.sliding_window is None, \ - "Prefix caching is not supported with sliding window" - sliding_context_len = context_len - if self.attn_backend.get_name() == "flash-attn": # NOTE(woosuk): For flash-attn, the block table should # include the entries for the incoming prefill tokens. @@ -391,9 +329,14 @@ def _prepare_model_input( if seq_group_metadata.block_tables is not None: # chunked prefill or decode block_table = seq_group_metadata.block_tables[seq_id] - if curr_sliding_window_blocks is not None: - block_table = block_table[ - -curr_sliding_window_blocks:] + if self.sliding_window is not None: + # chunked prefill doesn't support sliding window. + assert (not self.scheduler_config. + chunked_prefill_enabled) + sliding_window_blocks = (self.sliding_window // + self.block_size) + block_table = block_table[-sliding_window_blocks:] + if self.attn_backend.get_name() == "flashinfer": paged_kv_indices.extend(block_table) paged_kv_indptr.append(paged_kv_indptr[-1] + @@ -411,9 +354,16 @@ def _prepare_model_input( block_table = [] block_tables.append(block_table) - seq_lens.append(sliding_seq_len) - context_lens.append(sliding_context_len) - query_len = sliding_seq_len - sliding_context_len + # TODO(sang): This is a hack to make sliding window work with + # paged attn. We can remove it if we make paged attn kernel + # to properly handle slinding window attn. 
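A toy illustration of the decode-time sliding-window cap that the hunk below restores; the window size and sequence length are invented. The sequence length is clamped to the window, and decode always has a single query token.

seq_len, sliding_window = 4096, 1024
seq_len = min(seq_len, sliding_window)
context_len = seq_len - 1
query_len = seq_len - context_len
assert (seq_len, context_len, query_len) == (1024, 1023, 1)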
+ if (self.sliding_window is not None and not is_prompt): + seq_len = min(seq_len, self.sliding_window) + context_len = seq_len - 1 + + seq_lens.append(seq_len) + context_lens.append(context_len) + query_len = seq_len - context_len query_lens.append(query_len) input_tokens.extend(tokens) input_positions.extend(list(range(context_len, seq_len))) @@ -430,7 +380,7 @@ def _prepare_model_input( "seq_len: {}, context_len: {}, query_len: {}".format( seq_len, context_len, query_len)) num_decode_tokens += query_len - decode_seq_lens.append(sliding_seq_len) + decode_seq_lens.append(seq_len) if lora_id > 0: lora_requests.add(seq_group_metadata.lora_request) @@ -438,21 +388,14 @@ def _prepare_model_input( lora_index_mapping += [lora_id] * query_len lora_prompt_mapping.extend( [lora_id] * - (query_len if seq_group_metadata.sampling_params + (seq_len - + context_len if seq_group_metadata.sampling_params and seq_group_metadata.sampling_params.prompt_logprobs - is not None else 1)) + else 1)) - mm_data = seq_group_metadata.multi_modal_data - if mm_data is not None: - # Process multi-modal data - if self.multi_modal_input_processor is None: - raise ValueError( - "Multi-modal inputs are only supported by " - "vision language models.") - - mm_kwargs = self.multi_modal_input_processor(mm_data) - for k, v in mm_kwargs.items(): - multi_modal_kwargs_list[k].append(v) + if seq_group_metadata.multi_modal_data: + multi_modal_input_list.append( + seq_group_metadata.multi_modal_data.data) if _is_block_tables_empty(seq_group_metadata.block_tables): # During memory profiling, the block tables are not @@ -474,10 +417,9 @@ def _prepare_model_input( start_idx = 0 if self.sliding_window is not None: if is_prompt: - assert self.scheduler_config.use_v2_block_manager \ - or context_len == 0, ( + assert context_len == 0, ( "Prefix caching is currently not supported with " - "sliding window attention in V1 block manager") + "sliding window attention") # It is an optimization. When it is decoding, it is always # 0. When prefill, we use it to not write slots to kv cache # to save memory. 
@@ -538,6 +480,29 @@ def _prepare_model_input( ) assert max_query_len > 0, ("query_lens: {}".format(query_lens)) + context_lens_tensor = torch.tensor(context_lens, + dtype=torch.int, + device=self.device) + + if multi_modal_input_list: + assert self.vision_language_config, ( + "Multi-modal inputs are only supported by " + "vision language models.") + multi_modal_input = torch.cat(multi_modal_input_list, + dim=0).to(self.device) + else: + multi_modal_input = None + + seq_lens_tensor = torch.tensor(seq_lens, + dtype=torch.int, + device=self.device) + query_lens_tensor = torch.tensor(query_lens, + dtype=torch.long, + device=self.device) + query_start_loc = torch.zeros(query_lens_tensor.shape[0] + 1, + dtype=torch.int32, + device=self.device) + seq_lens_tensor = torch.tensor(seq_lens, dtype=torch.int, device=self.device) @@ -545,6 +510,11 @@ def _prepare_model_input( dtype=torch.int32, device=self.device) + torch.cumsum(query_lens_tensor, + dim=0, + dtype=query_start_loc.dtype, + out=query_start_loc[1:]) + torch.cumsum(seq_lens_tensor, dim=0, dtype=seq_start_loc.dtype, @@ -597,21 +567,6 @@ def _prepare_model_input( seq_start_loc=seq_start_loc, data_type=kv_cache_dtype) else: - context_lens_tensor = torch.tensor(context_lens, - dtype=torch.int, - device=self.device) - query_lens_tensor = torch.tensor(query_lens, - dtype=torch.long, - device=self.device) - query_start_loc = torch.zeros(query_lens_tensor.shape[0] + 1, - dtype=torch.int32, - device=self.device) - - torch.cumsum(query_lens_tensor, - dim=0, - dtype=query_start_loc.dtype, - out=query_start_loc[1:]) - attn_metadata = self.attn_backend.make_metadata( num_prefills=num_prefills, slot_mapping=slot_mapping_tensor, @@ -630,18 +585,11 @@ def _prepare_model_input( ) if self.lora_config: - lora_mapping = LoRAMapping( - lora_index_mapping, - lora_prompt_mapping, - ) + lora_mapping = LoRAMapping(lora_index_mapping, lora_prompt_mapping, + bool(attn_metadata.prefill_metadata)) else: lora_mapping = None - multi_modal_kwargs = { - k: torch.cat(v, dim=0).to(self.device) - for k, v in multi_modal_kwargs_list.items() - } - return ModelInput( input_tokens=input_tokens_tensor, input_positions=input_positions_tensor, @@ -650,7 +598,7 @@ def _prepare_model_input( query_lens=query_lens, lora_mapping=lora_mapping, lora_requests=lora_requests, - multi_modal_kwargs=multi_modal_kwargs, + multi_modal_input=multi_modal_input, slot_mapping=slot_mapping_tensor, num_prefill_tokens=num_prefill_tokens, num_decode_tokens=num_decode_tokens, @@ -661,7 +609,7 @@ def prepare_input_tensors( self, seq_group_metadata_list: Optional[List[SequenceGroupMetadata]], ) -> Tuple[torch.Tensor, torch.Tensor, AttentionMetadata, SamplingMetadata, - Set[LoRARequest], LoRAMapping, Dict[str, torch.Tensor]]: + Set[LoRARequest], LoRAMapping, torch.Tensor]: if self.is_driver_worker: assert seq_group_metadata_list is not None # Prepare input tensors. 
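For orientation, a sketch of the extended LoRAMapping constructed above; the third positional argument is the value of bool(attn_metadata.prefill_metadata) and is read back as mapping.is_prefilling in vllm/lora/models.py. The id values here are invented, and the construction itself is left as a comment since the mapping class definition is not part of this hunk.

lora_index_mapping = [1, 1, 1, 2, 2]      # one LoRA id per input token
lora_prompt_mapping = [1, 2]              # one LoRA id per sequence needing logits
is_prefilling = True                      # bool(attn_metadata.prefill_metadata)
# mapping = LoRAMapping(lora_index_mapping, lora_prompt_mapping, is_prefilling)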
@@ -673,7 +621,7 @@ def prepare_input_tensors( query_lens, lora_mapping, lora_requests, - multi_modal_kwargs, + multi_modal_input, slot_mapping, num_prefill_tokens, num_decode_tokens, @@ -690,7 +638,7 @@ def prepare_input_tensors( sampling_metadata.selected_token_indices, "lora_requests": lora_requests, "lora_mapping": lora_mapping, - "multi_modal_kwargs": multi_modal_kwargs, + "multi_modal_input": multi_modal_input, "num_prefill_tokens": num_prefill_tokens, "num_decode_tokens": num_decode_tokens, "slot_mapping": slot_mapping, @@ -707,7 +655,7 @@ def prepare_input_tensors( "selected_token_indices") lora_mapping = metadata_dict.pop("lora_mapping") lora_requests = metadata_dict.pop("lora_requests") - multi_modal_kwargs = metadata_dict.pop("multi_modal_kwargs") + multi_modal_input = metadata_dict.pop("multi_modal_input") if metadata_dict: attn_metadata = self.attn_backend.make_metadata( **metadata_dict) @@ -722,7 +670,7 @@ def prepare_input_tensors( return (input_tokens, input_positions, attn_metadata, sampling_metadata, lora_requests, lora_mapping, - multi_modal_kwargs) + multi_modal_input) @torch.inference_mode() def execute_model( @@ -731,7 +679,7 @@ def execute_model( kv_caches: List[torch.Tensor], ) -> Optional[SamplerOutput]: (input_tokens, input_positions, attn_metadata, sampling_metadata, - lora_requests, lora_mapping, multi_modal_kwargs + lora_requests, lora_mapping, multi_modal_input ) = self.prepare_input_tensors(seq_group_metadata_list) if self.lora_config: @@ -745,14 +693,15 @@ def execute_model( model_executable = self.graph_runners[graph_batch_size] else: model_executable = self.model - - hidden_states = model_executable( - input_ids=input_tokens, - positions=input_positions, - kv_caches=kv_caches, - attn_metadata=attn_metadata, - **multi_modal_kwargs, - ) + execute_model_kwargs = { + "input_ids": input_tokens, + "positions": input_positions, + "kv_caches": kv_caches, + "attn_metadata": attn_metadata, + } + if self.vision_language_config: + execute_model_kwargs.update({"image_input": multi_modal_input}) + hidden_states = model_executable(**execute_model_kwargs) # Compute the logits. logits = self.model.compute_logits(hidden_states, sampling_metadata) @@ -808,24 +757,16 @@ def profile_run(self) -> None: # To exercise the worst scenario for GPU memory consumption, # the number of seqs (batch_size) is chosen to maximize the number # of images processed. 
- model_config = self.model_config - vlm_config = self.vision_language_config - - if vlm_config: + if self.vision_language_config: max_num_seqs = min( max_num_seqs, - int(max_num_batched_tokens / vlm_config.image_feature_size)) + int(max_num_batched_tokens / + self.vision_language_config.image_feature_size)) for group_id in range(max_num_seqs): seq_len = (max_num_batched_tokens // max_num_seqs + (group_id < max_num_batched_tokens % max_num_seqs)) - - if vlm_config is None: - seq_data = SequenceData([0] * seq_len) - dummy_multi_modal_data = None - else: - seq_data, dummy_multi_modal_data = MULTIMODAL_REGISTRY \ - .dummy_data_for_profiling(seq_len, model_config, vlm_config) - + seq_data, fake_multi_modal_input = _prepare_fake_inputs( + seq_len, self.vision_language_config) seq = SequenceGroupMetadata( request_id=str(group_id), is_prompt=True, @@ -834,7 +775,7 @@ def profile_run(self) -> None: block_tables=None, lora_request=dummy_lora_requests_per_seq[group_id] if dummy_lora_requests_per_seq else None, - multi_modal_data=dummy_multi_modal_data, + multi_modal_data=fake_multi_modal_input, ) seqs.append(seq) @@ -906,10 +847,6 @@ def capture_model(self, kv_caches: List[torch.Tensor]) -> None: seq_lens = torch.ones(max_batch_size, dtype=torch.int32).cuda() block_tables = torch.from_numpy(self.graph_block_tables).cuda() - # Prepare buffer for outputs. These will be reused for all batch sizes. - # It will be filled after the first graph capture. - hidden_states: Optional[torch.Tensor] = None - graph_batch_size = _get_graph_batch_size( self.scheduler_config.max_num_seqs) batch_size_capture_list = [ @@ -939,18 +876,14 @@ def capture_model(self, kv_caches: List[torch.Tensor]) -> None: ) if self.lora_config: - lora_mapping = LoRAMapping( - [0] * batch_size, - [0] * batch_size, - ) + lora_mapping = LoRAMapping([0] * batch_size, + [0] * batch_size, False) self.set_active_loras(set(), lora_mapping) graph_runner = CUDAGraphRunner(self.model) - hidden_states = graph_runner.capture( + graph_runner.capture( input_tokens[:batch_size], input_positions[:batch_size], - hidden_states[:batch_size] - if hidden_states is not None else None, kv_caches, attn_metadata, memory_pool=self.graph_memory_pool, @@ -987,46 +920,35 @@ def capture( self, input_ids: torch.Tensor, positions: torch.Tensor, - hidden_states: Optional[torch.Tensor], kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, memory_pool: Optional[Tuple[int, int]], stream: torch.cuda.Stream, **kwargs, - ) -> torch.Tensor: + ) -> None: assert self._graph is None - # Run the model a few times without capturing the graph. + # Run the model once without capturing the graph. # This is to make sure that the captured graph does not include the # kernel launches for initial benchmarking (e.g., Triton autotune). - # Note one iteration is not enough for torch.jit.script - for _ in range(_NUM_WARMUP_ITERS): - self.model( - input_ids, - positions, - kv_caches, - attn_metadata, - **kwargs, - ) + self.model( + input_ids, + positions, + kv_caches, + attn_metadata, + **kwargs, + ) torch.cuda.synchronize() # Capture the graph. 
self._graph = torch.cuda.CUDAGraph() with torch.cuda.graph(self._graph, pool=memory_pool, stream=stream): - output_hidden_states = self.model( + hidden_states = self.model( input_ids, positions, kv_caches, attn_metadata, **kwargs, ) - if hidden_states is not None: - hidden_states.copy_(output_hidden_states) - else: - hidden_states = output_hidden_states - del output_hidden_states - # make sure `output_hidden_states` is deleted - # in the graph's memory pool - gc.collect() torch.cuda.synchronize() # Save the input and output buffers. @@ -1039,7 +961,7 @@ def capture( "block_tables": attn_metadata.decode_metadata.block_tables, } self.output_buffers = {"hidden_states": hidden_states} - return hidden_states + return def forward( self, @@ -1086,6 +1008,24 @@ def _get_graph_batch_size(batch_size: int) -> int: _BATCH_SIZE_ALIGNMENT * _BATCH_SIZE_ALIGNMENT) +def _prepare_fake_inputs( + seq_len: int, vision_language_config: Optional[VisionLanguageConfig]): + """Prepare fake inputs for profile run.""" + if vision_language_config: + prompt_tokens = [ + vision_language_config.image_token_id + ] * vision_language_config.image_feature_size + [0] * ( + seq_len - vision_language_config.image_feature_size) + fake_image_input = MultiModalData( + type=MultiModalData.Type.IMAGE, + data=torch.zeros(vision_language_config.image_input_shape, + dtype=torch.float16)) + else: + prompt_tokens = [0] * seq_len + fake_image_input = None + return SequenceData(prompt_tokens), fake_image_input + + def _is_block_tables_empty(block_tables: Union[None, Dict]): """ Check if block_tables is None or a dictionary with all None values. From 4c5889e9664dac5899217cd1e9abb72be06f422a Mon Sep 17 00:00:00 2001 From: jeejeeli Date: Tue, 18 Jun 2024 11:45:06 +0800 Subject: [PATCH 26/71] delete punica --- tests/lora/test_punica.py | 234 ----------------------------- vllm/lora/fully_sharded_layers.py | 1 - vllm/lora/punica.py | 235 +----------------------------- 3 files changed, 6 insertions(+), 464 deletions(-) delete mode 100644 tests/lora/test_punica.py diff --git a/tests/lora/test_punica.py b/tests/lora/test_punica.py deleted file mode 100644 index f021c003b132..000000000000 --- a/tests/lora/test_punica.py +++ /dev/null @@ -1,234 +0,0 @@ -# Based on code from https://github.com/punica-ai/punica - -import pytest -import torch - -import vllm.lora.punica as punica - - -def assert_close(a, b): - rtol, atol = { - torch.float16: (5e-3, 5e-3), - torch.bfloat16: (3e-2, 2e-2), - torch.float32: (None, None), - }[a.dtype] - torch.testing.assert_close(a, b, rtol=rtol, atol=atol) - - -def _lora_ref_impl( - y_final: torch.Tensor, - x: torch.Tensor, - wa_T_all: torch.Tensor, - wb_T_all: torch.Tensor, - indicies: torch.LongTensor, - layer_idx: int, - scale: float, -): - y_stage_1 = torch.empty( - (x.size(0), wa_T_all.size(-2)), - dtype=torch.float32, - device=x.device, - ) - bs = x.shape[0] - s = torch.tensor(scale, dtype=torch.float32, device=x.device) - for i, lora_idx in zip(range(bs), indicies.cpu().tolist()): - xi = x[i].unsqueeze(0).to(torch.float32) - wa = wa_T_all[lora_idx, layer_idx].transpose(-1, -2).to(torch.float32) - if wb_T_all is not None: - wb = wb_T_all[lora_idx, layer_idx].transpose(-1, - -2).to(torch.float32) - - tmp = xi @ wa - y_stage_1[i] = tmp.squeeze(0) - y_final[i] += ((tmp @ wb).squeeze(0) * - s if wb_T_all is not None else y_stage_1[i]) - return y_final, y_stage_1 - - -H1 = H2 = [ - 128, - 256, - 512, - 1024, - 1152, - 1280, - 1536, - 2048, - 2304, - 2560, - 2752, - 3072, - 3328, - 3456, - 3584, - 4096, - 4608, - 
5120, - 5504, - 5632, - 6144, - 6400, - 6848, - 6912, - 7168, - 8192, - 9216, - 10240, - 11008, - 13824, - 14336, - 15360, - 22016, - 24576, - 27392, - 27648, - 32000, - 32256, - 32512, - 32768, - 33024, - 36864, - 43264, - 49152, - 64000, - 64256, - 102400, - 102656, - 128000, - 128256, -] -H2 = [64] + H2 -R = [1, 2, 4] -SEED = [0xabcdabcd987] -CUDA_DEVICES = [ - f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) -] - - -@pytest.mark.parametrize("dtype_str", ["float16", "bfloat16"]) -@pytest.mark.parametrize("h1", H1) -@pytest.mark.parametrize("r", R) -@pytest.mark.parametrize("seed", SEED) -@torch.inference_mode() -def test_lora_a_extra_shapes(dtype_str, h1, r, seed): - torch.manual_seed(seed) - num_loras = 4 - num_layers = 1 - bs = 32 - dtype = getattr(torch, dtype_str) - device = torch.device("cuda") - - wa_T_all = torch.randn(num_loras, - num_layers, - r, - h1, - dtype=dtype, - device=device) - indices = torch.randint(num_loras, (bs, ), dtype=torch.long, device=device) - - for layer_idx in range(num_layers): - x = torch.randn(bs, h1, dtype=dtype, device=device) - y = torch.randn(bs, r, dtype=dtype, device=device) - - y_ref = y.clone() - _lora_ref_impl( - y_ref, - x, - wa_T_all, - None, - indices, - layer_idx, - 1.0, - ) - - y_our = y.clone() - punica.bgmv(y_our, x, wa_T_all, indices, layer_idx, 1.0) - - assert_close(y_ref, y_our) - - -@pytest.mark.parametrize("dtype_str", ["float16", "bfloat16"]) -@pytest.mark.parametrize("h1", H1) -@pytest.mark.parametrize("h2", H2) -@pytest.mark.parametrize("seed", SEED) -@pytest.mark.parametrize("device", CUDA_DEVICES) -@torch.inference_mode() -def test_lora_correctness(dtype_str, h1, h2, seed, device): - torch.manual_seed(seed) - num_loras = 4 - num_layers = 1 - r = 8 - bs = 32 - scale = 0.123 - dtype = getattr(torch, dtype_str) - torch.set_default_device(device) - - wa_T_all = torch.randn(num_loras, num_layers, r, h1, dtype=dtype) - wb_T_all = torch.randn(num_loras, num_layers, h2, r, dtype=dtype) - indices = torch.randint(num_loras, (bs, ), dtype=torch.long) - - for layer_idx in range(num_layers): - x = torch.randn(bs, h1, dtype=dtype) - y = torch.randn(bs, h2, dtype=dtype) - - y_ref = y.clone() - _lora_ref_impl(y_ref, x, wa_T_all, wb_T_all, indices, layer_idx, scale) - - y_our = y.clone() - punica.add_lora(y_our, x, wa_T_all, wb_T_all, indices, layer_idx, - scale) - - assert_close(y_ref, y_our) - - -@pytest.mark.parametrize("dtype_str", ["float16", "bfloat16"]) -@pytest.mark.parametrize("h1", H1) -@pytest.mark.parametrize("h2", H2) -@pytest.mark.parametrize("seed", SEED) -@pytest.mark.parametrize("device", CUDA_DEVICES) -@torch.inference_mode() -def test_lora_correctness_slice(dtype_str, h1, h2, seed, device): - if h2 % 3 != 0 or h2 // 3 not in H1: - pytest.skip("h2 must be divisible by 3 and in supported shapes") - torch.manual_seed(seed) - num_loras = 4 - num_layers = 1 - r = 8 - bs = 32 - scale = 0.123 - dtype = getattr(torch, dtype_str) - torch.set_default_device(device) - - wa_T_all_0 = torch.randn(num_loras, num_layers, r, h1, dtype=dtype) - wa_T_all_1 = torch.randn(num_loras, num_layers, r, h1, dtype=dtype) - wa_T_all_2 = torch.randn(num_loras, num_layers, r, h1, dtype=dtype) - wb_T_all_0 = torch.randn(num_loras, num_layers, h2 // 3, r, dtype=dtype) - wb_T_all_1 = torch.randn(num_loras, num_layers, h2 // 3, r, dtype=dtype) - wb_T_all_2 = torch.randn(num_loras, num_layers, h2 // 3, r, dtype=dtype) - - indices = torch.randint(num_loras, (bs, ), dtype=torch.long) - - for layer_idx in range(num_layers): - x = 
torch.randn(bs, h1, dtype=dtype) - y = torch.randn(bs, h2, dtype=dtype) - s = h2 // 3 - - y_ref = y.clone() - _lora_ref_impl(y_ref[:, :s], x, wa_T_all_0, wb_T_all_0, indices, - layer_idx, scale) - _lora_ref_impl(y_ref[:, s:s * 2], x, wa_T_all_1, wb_T_all_1, indices, - layer_idx, scale) - _lora_ref_impl(y_ref[:, s * 2:], x, wa_T_all_2, wb_T_all_2, indices, - layer_idx, scale) - - y_our = y.clone() - punica.add_lora_slice(y_our, x, wa_T_all_0, wb_T_all_0, indices, - layer_idx, scale, 0, s) - punica.add_lora_slice(y_our, x, wa_T_all_1, wb_T_all_1, indices, - layer_idx, scale, s, s) - punica.add_lora_slice(y_our, x, wa_T_all_2, wb_T_all_2, indices, - layer_idx, scale, s * 2, s) - - assert_close(y_ref[:, :s], y_our[:, :s]) - assert_close(y_ref[:, s:s * 2], y_our[:, s:s * 2]) - assert_close(y_ref[:, s * 2:], y_our[:, s * 2:]) diff --git a/vllm/lora/fully_sharded_layers.py b/vllm/lora/fully_sharded_layers.py index fbea667a215e..e405d06ef695 100644 --- a/vllm/lora/fully_sharded_layers.py +++ b/vllm/lora/fully_sharded_layers.py @@ -17,7 +17,6 @@ MergedQKVParallelLinearWithLora, RowParallelLinearWithLoRA, ) -from vllm.lora.punica import bgmv, dispatch_bgmv_low_level from vllm.lora.punica import ( add_shrink_triton, add_expand_triton, diff --git a/vllm/lora/punica.py b/vllm/lora/punica.py index 7366edf81491..ec4366acf456 100644 --- a/vllm/lora/punica.py +++ b/vllm/lora/punica.py @@ -9,20 +9,6 @@ from vllm.lora.ops.sgmv_expand_slice import sgmv_expand_slice from vllm.lora.ops.sgmv_shrink import sgmv_shrink - -def _raise_import_error(e): - if torch.cuda.get_device_capability() < (8, 0): - raise ImportError( - "punica LoRA kernels require compute capability >= 8.0" - ) from e - else: - raise ImportError( - "punica LoRA kernels could not be imported. If you built vLLM " - "from source, make sure VLLM_INSTALL_PUNICA_KERNELS=1 env var " - "was set." - ) from e - - _PARAMS_CACHE: Dict[int, Tuple] = {} @@ -30,8 +16,7 @@ def _compute_params(token_lora_tensor: torch.Tensor): pointer = token_lora_tensor.data_ptr() if pointer not in _PARAMS_CACHE: lora_indices_tensor, seq_length_tensor = torch.unique_consecutive( - token_lora_tensor, return_counts=True - ) + token_lora_tensor, return_counts=True) cum_result = torch.cumsum(seq_length_tensor, dim=0) b_seq_start_tensor = torch.zeros_like(seq_length_tensor) b_seq_start_tensor[1:].copy_(cum_result[:-1]) @@ -54,221 +39,13 @@ def reset_params_cache(): _PARAMS_CACHE.clear() -def _get_prefilling_params( - token_lora_tensor: torch.Tensor, cache_clear: bool = False -): +def _get_prefilling_params(token_lora_tensor: torch.Tensor, + cache_clear: bool = False): if cache_clear: reset_params_cache() return _compute_params(token_lora_tensor) -def bgmv( - y: torch.Tensor, - x: torch.Tensor, - w_t_all: torch.Tensor, - indicies: torch.LongTensor, - layer_idx: int, - scale: float, -): - """ - Semantics: - y[i] += ( - x[i].unsqueeze(0) - @ w_t_all[indices[i], layer_idx, :, :].transpose(-1, -2) - * scale - ).squeeze(0) - - Args: - y: Shape: `[B, H2]`. Output vectors. Will be changed in-place. - x: Shape: `[B, H1]`. Input vectors. - w_t_all: Shape: `[None, L, H2, H1]`. All of the transposed weight - matrices. - indicies: Shape: `[B]`. Indices of the weight matrices. - layer_idx: Layer index of the weight matrices. - scale: Scaling factor. 
- """ - try: - import vllm._punica_C as punica_kernels - except ImportError as e: - _raise_import_error(e) - - punica_kernels.dispatch_bgmv(y, x, w_t_all, indicies, layer_idx, scale) - - -def dispatch_bgmv_low_level( - y: torch.Tensor, - x: torch.Tensor, - w_t_all: torch.Tensor, - indicies: torch.LongTensor, - layer_idx: int, - scale: float, - y_offset: int, - y_slice_size: int, -): - """ - Same as `bgmv` but you can operate on slices of y. - Pass whole y, define y_offset and y_slice_size. - - Semantics: - y[i] += ( - x[i].unsqueeze(0) - @ w_t_all[indices[i], layer_idx, :, :].transpose(-1, -2) - * scale - ).squeeze(0) - - Args: - y: Shape: `[B, H2]`. Output vectors. Will be changed in-place. - x: Shape: `[B, H1]`. Input vectors. - w_t_all: Shape: `[None, L, y_slice_size, H1]`. Column partition of - all of the transposed LoRA matrices. - indicies: Shape: `[B]`. Indices of the LoRA weights. - layer_idx: Layer index of LoRA weights. - scale: Scaling factor. - y_offset: Offset to apply to the starting column of y. - y_slice_size: Size of the y column slice. - """ - try: - import vllm._punica_C as punica_kernels - except ImportError as e: - _raise_import_error(e) - punica_kernels.dispatch_bgmv_low_level( - y, - x, - w_t_all, - indicies, - layer_idx, - scale, - x.size(1), - y_slice_size, - y_offset, - ) - - -def add_lora( - y: torch.Tensor, - x: torch.Tensor, - wa_t_all: torch.Tensor, - wb_t_all: torch.Tensor, - indicies: torch.LongTensor, - layer_idx: int, - scale: float, - *, - buffer: Optional[torch.Tensor] = None, -): - """ - Semantics: - y[i] += ( - x[i].unsqueeze(0) - @ wa_t_all[indices[i], layer_idx, :, :].transpose(-1, -2) - @ wb_t_all[indices[i], layer_idx, :, :].transpose(-1, -2) - * scale - ).squeeze(0) - - Args: - y: Shape: `[B, H2]`. Output vectors. Will be changed in-place. - x: Shape: `[B, H1]`. Input vectors. - wa_t_all: Shape: `[None, L, R, H1]`. All of the transposed - LoRA A matrices. - wb_t_all: Shape: `[None, L, H2, R]`. All of the transposed - LoRA B matrices. - indicies: Shape: `[B]`. Indices of the LoRA weights. - layer_idx: Layer index of LoRA weights. - scale: Scaling factor. - buffer: Optional. Shape: `[B, R]`. Temporary buffer. - """ - try: - import vllm._punica_C as punica_kernels - except ImportError as e: - _raise_import_error(e) - - r = wb_t_all.size(-1) - if buffer is None: - # We set the buffer to be float32 by default to avoid - # numerical inaccuracies that would otherwise happen - # due to downcasting. - buffer = torch.zeros( - (x.size(0), r), dtype=torch.float32, device=x.device - ) - punica_kernels.dispatch_bgmv(buffer, x, wa_t_all, indicies, layer_idx, 1.0) - punica_kernels.dispatch_bgmv( - y, buffer, wb_t_all, indicies, layer_idx, scale - ) - - -def add_lora_slice( - y: torch.Tensor, - x: torch.Tensor, - wa_t_all: torch.Tensor, - wb_t_all: torch.Tensor, - indicies: torch.LongTensor, - layer_idx: int, - scale: float, - y_offset: int, - y_slice_size: int, - *, - buffer: Optional[torch.Tensor] = None, -): - """ - Same as `add_lora` but you can operate on slices of y. - Pass whole y, define y_offset and y_slice_size. - - Semantics: - y[i] += ( - x[i].unsqueeze(0) - @ wa_t_all[indices[i], layer_idx, :, :].transpose(-1, -2) - @ wb_t_all[indices[i], layer_idx, :, :].transpose(-1, -2) - * scale - ).squeeze(0) - - Args: - y: Shape: `[B, H2]`. Output vectors. Will be changed in-place. - x: Shape: `[B, H1]`. Input vectors. - wa_t_all: Shape: `[None, L, R, H1]`. All of the transposed - LoRA A matrices. - wb_t_all: Shape: `[None, L, H2, R]`. 
All of the transposed - LoRA B matrices. - indicies: Shape: `[B]`. Indices of the LoRA weights. - layer_idx: Layer index of LoRA weights. - scale: Scaling factor. - y_offset: Offset to apply to the starting column of y. - y_slice_size: Size of the y column slice. - """ - try: - import vllm._punica_C as punica_kernels - except ImportError as e: - _raise_import_error(e) - - r = wb_t_all.size(-1) - if buffer is None: - # We set the buffer to be float32 by default ,refer to: - # https://github.com/triton-lang/triton/issues/1387 - buffer = torch.zeros( - (x.size(0), r), dtype=torch.float32, device=x.device - ) - punica_kernels.dispatch_bgmv_low_level( - buffer, - x, - wa_t_all, - indicies, - layer_idx, - 1.0, - x.size(1), - buffer.size(1), - 0, - ) - punica_kernels.dispatch_bgmv_low_level( - y, - buffer, - wb_t_all, - indicies, - layer_idx, - scale, - buffer.size(1), - y_slice_size, - y_offset, - ) - - def add_shrink_triton( y: torch.Tensor, x: torch.Tensor, @@ -403,9 +180,9 @@ def add_lora_triton( if buffer is None: # We set the buffer to be float32 by default ,refer to: # https://github.com/triton-lang/triton/issues/1387 - buffer = torch.zeros( - (x.size(0), r), dtype=torch.float32, device=x.device - ) + buffer = torch.zeros((x.size(0), r), + dtype=torch.float32, + device=x.device) add_shrink_triton( buffer, From 82560db571fb68e72661229fe37281f5a666aa3e Mon Sep 17 00:00:00 2001 From: jeejeeli Date: Wed, 19 Jun 2024 13:38:19 +0800 Subject: [PATCH 27/71] fix bug --- tests/lora/test_triton_punica.py | 4 ++-- vllm/lora/layers.py | 6 +++--- vllm/lora/models.py | 15 +++++++++------ 3 files changed, 14 insertions(+), 11 deletions(-) diff --git a/tests/lora/test_triton_punica.py b/tests/lora/test_triton_punica.py index a098aba16456..9aa210db7073 100644 --- a/tests/lora/test_triton_punica.py +++ b/tests/lora/test_triton_punica.py @@ -2,7 +2,6 @@ import torch import vllm._punica_C as punica_kernels -import vllm.lora.punica as punica from vllm.lora.ops.bgmv_expand import bgmv_expand from vllm.lora.ops.bgmv_expand_slice import bgmv_expand_slice from vllm.lora.ops.bgmv_shrink import bgmv_shrink @@ -90,7 +89,8 @@ def assert_close(a, b): @torch.inference_mode() def _punica_bgmv(out_tensor, inputs, lora_weights, indices, scaling): layer_idx = 0 - punica.bgmv(out_tensor, inputs, lora_weights, indices, layer_idx, scaling) + punica_kernels.dispatch_bgmv(out_tensor, inputs, lora_weights, indices, + layer_idx, scaling) return diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py index 1dd89df3c4f6..abab47f34fdc 100644 --- a/vllm/lora/layers.py +++ b/vllm/lora/layers.py @@ -85,7 +85,7 @@ def _apply_expand_triton( x = x.view(-1, x.shape[-1]) output = output.view(-1, output.shape[-1]) token_num = indices_info[0] - is_prefilling = bool(indices_info[4]) + is_prefilling = bool(indices_info[5]) add_expand_triton( output, x, @@ -131,7 +131,7 @@ def _apply_lora_triton( output = output.view(-1, output.shape[-1]) token_num = indices_info[0] - is_prefilling = bool(indices_info[4]) + is_prefilling = bool(indices_info[5]) add_lora_triton( output, x, @@ -173,7 +173,7 @@ def _apply_lora_triton_nslice( output = output.view(-1, output.shape[-1]) token_num = indices_info[0] - is_prefilling = bool(indices_info[4]) + is_prefilling = bool(indices_info[5]) offset_left = 0 # TODO fuse these kernels for slice_idx in range(len(output_slices)): diff --git a/vllm/lora/models.py b/vllm/lora/models.py index f817bf65ec96..4cb977a25de1 100644 --- a/vllm/lora/models.py +++ b/vllm/lora/models.py @@ -87,8 +87,7 @@ def 
convert_mapping( indices_len: List of lengths of the above tensors. Used to index into each tensor. It contains length for (base_indices, sampler_indices, sampler_indices_padded, - embeddings_indices, long_lora_indices). If long_lora doesn't - exist, it only contains first 4 entries. + embeddings_indices, long_lora_indices,prefilling stage flag). """ index_mapping_indices: List[int] = list(mapping.index_mapping).copy() embedding_indices = index_mapping_indices.copy() @@ -153,6 +152,10 @@ def convert_mapping( ] if long_lora_indices_len is not None: indices_len.append(long_lora_indices_len) + else: + #If long_lora doesn'texist,append None + indices_len.append(None) + indices_len.append(int(mapping.is_prefilling)) return ( base_indices, sampler_indices, @@ -428,10 +431,10 @@ def __init__( # Scaling factor -> offset to the sin_cos_cache to it. # Used for long context lora. self.scaling_factor_to_offset: Dict[float, int] = {} - # 5 is the number of indicies tensors. + # 6 is the number of indicies tensors. # base_indices, sampler_indices, sampler_indices_padded, - # embeddings_indices,prefilling or decoding - self.indices_len: List[Optional[int]] = [None] * 5 + # embeddings_indices,long_lora_indices,prefilling or decoding + self.indices_len: List[Optional[int]] = [None] * 6 self.model: nn.Module = model if hasattr(self.model, "supported_lora_modules"): @@ -588,7 +591,7 @@ def _set_lora_mapping(self, mapping: LoRAMapping) -> None: else: self.long_lora_indices.zero_() # Maintain the reference - self.indices_len[:] = indices_len + [int(mapping.is_prefilling)] + self.indices_len[:] = indices_len # if mapping.is_prefilling: punica.reset_params_cache() From e3ba5a5ae3cef0106b5420d9b670f0c5b39e06a4 Mon Sep 17 00:00:00 2001 From: jeejeeli Date: Thu, 20 Jun 2024 13:41:57 +0800 Subject: [PATCH 28/71] fix unit test --- tests/lora/test_lora.py | 93 ++++++++++++----- tests/lora/test_triton_punica.py | 149 +++++++++++++-------------- vllm/lora/fully_sharded_layers.py | 82 +++------------ vllm/lora/layers.py | 122 ++++++++++------------ vllm/lora/models.py | 10 +- vllm/lora/ops/bgmv_expand.py | 6 +- vllm/lora/ops/bgmv_expand_slice.py | 4 +- vllm/lora/ops/bgmv_shrink.py | 2 + vllm/lora/ops/sgmv_expand.py | 4 +- vllm/lora/ops/sgmv_expand_slice.py | 3 +- vllm/lora/ops/utils.py | 4 +- vllm/lora/punica.py | 57 ++++++++-- vllm/model_executor/layers/linear.py | 1 + 13 files changed, 279 insertions(+), 258 deletions(-) diff --git a/tests/lora/test_lora.py b/tests/lora/test_lora.py index 3415d36b7e34..4bc959b826bb 100644 --- a/tests/lora/test_lora.py +++ b/tests/lora/test_lora.py @@ -20,13 +20,16 @@ torch.bfloat16: (3e-2, 2e-2), } +STAGES = [0, 1] #prefilling(1) or decoding(0) + @pytest.mark.parametrize("m", TENSOR_SIZES) @pytest.mark.parametrize("n", TENSOR_SIZES) @pytest.mark.parametrize("k", BATCH_SIZES) @pytest.mark.parametrize("rank", RANKS) @pytest.mark.parametrize("dtype", DTYPES) -def test_apply_lora(m, n, k, rank, dtype) -> None: +@pytest.mark.parametrize("stage", STAGES) +def test_apply_lora(m, n, k, rank, dtype, stage) -> None: manager = DummyLoRAManager() module_name = "module" @@ -53,19 +56,31 @@ def test_apply_lora(m, n, k, rank, dtype) -> None: for i in range(lora_a_stack.shape[0]): lora_a_stack[i][0] = lora.lora_a.T lora_b_stack[i][0] = (lora.lora_b * lora.scaling).T - + indices_info = [None] * 6 + indices_info[0] = k + indices_info[5] = stage output = torch.zeros(k, m, device="cuda", dtype=dtype) - _apply_lora( - input, lora_a_stack, lora_b_stack, - torch.randint(0, lora_a_stack.shape[0], (len(input), 
), device="cuda"), - output) + _apply_lora(input, + lora_a_stack, + lora_b_stack, + torch.randint(0, + lora_a_stack.shape[0], (len(input), ), + device="cuda"), + indices_info, + output, + cache_clear=True) rtol, atol = TOLERANCES[dtype] assert torch.allclose(expected, output, rtol=rtol, atol=atol) output[:] = 0 - _apply_lora(input, lora_a_stack, lora_b_stack, - torch.full((len(input), ), -1, device="cuda"), output) + _apply_lora(input, + lora_a_stack, + lora_b_stack, + torch.full((len(input), ), -1, device="cuda"), + indices_info, + output, + cache_clear=True) assert torch.allclose(torch.zeros_like(output), output) manager.reset_lora() @@ -76,7 +91,8 @@ def test_apply_lora(m, n, k, rank, dtype) -> None: @pytest.mark.parametrize("k", BATCH_SIZES) @pytest.mark.parametrize("rank", RANKS) @pytest.mark.parametrize("dtype", DTYPES) -def test_apply_lora_packed_2slice(m, n, k, rank, dtype) -> None: +@pytest.mark.parametrize("stage", STAGES) +def test_apply_lora_packed_2slice(m, n, k, rank, dtype, stage) -> None: if m % 2 != 0: pytest.skip("m must be divisible by 2") if m // 2 not in TENSOR_SIZES: @@ -91,7 +107,7 @@ def test_apply_lora_packed_2slice(m, n, k, rank, dtype) -> None: lora_1 = manager.get_module_lora(module_name + "1") manager.init_random_lora(module_name + "2", weight, rank=rank) lora_2 = manager.get_module_lora(module_name + "2") - + input = torch.rand(k, n, device="cuda", dtype=dtype) expected = torch.cat([ input @ lora_1.lora_a @ lora_1.lora_b * lora_1.scaling, @@ -120,21 +136,32 @@ def test_apply_lora_packed_2slice(m, n, k, rank, dtype) -> None: lora_b_stacks[0][i][0] = (lora_1.lora_b * lora_1.scaling).T lora_a_stacks[1][i][0] = lora_2.lora_a.T lora_b_stacks[1][i][0] = (lora_2.lora_b * lora_2.scaling).T - + indices_info = [None] * 6 + indices_info[0] = k + indices_info[5] = stage output = torch.zeros(k, m, device="cuda", dtype=dtype) - _apply_lora_packed_nslice( - input, lora_a_stacks, lora_b_stacks, - torch.randint(0, - lora_a_stacks[0].shape[0], (len(input), ), - device="cuda"), output, (m // 2, m // 2)) + _apply_lora_packed_nslice(input, + lora_a_stacks, + lora_b_stacks, + torch.randint(0, + lora_a_stacks[0].shape[0], + (len(input), ), + device="cuda"), + indices_info, + output, (m // 2, m // 2), + cache_clear=True) rtol, atol = TOLERANCES[dtype] assert torch.allclose(expected, output, rtol=rtol, atol=atol) output[:] = 0 - _apply_lora_packed_nslice(input, lora_a_stacks, lora_b_stacks, + _apply_lora_packed_nslice(input, + lora_a_stacks, + lora_b_stacks, torch.full((len(input), ), -1, device="cuda"), - output, (m // 2, m // 2)) + indices_info, + output, (m // 2, m // 2), + cache_clear=True) assert torch.allclose(torch.zeros_like(output), output) manager.reset_lora() @@ -145,7 +172,8 @@ def test_apply_lora_packed_2slice(m, n, k, rank, dtype) -> None: @pytest.mark.parametrize("k", BATCH_SIZES) @pytest.mark.parametrize("rank", RANKS) @pytest.mark.parametrize("dtype", DTYPES) -def test_apply_lora_packed_3slice(qkv, n, k, rank, dtype) -> None: +@pytest.mark.parametrize("stage", STAGES) +def test_apply_lora_packed_3slice(qkv, n, k, rank, dtype, stage) -> None: manager = DummyLoRAManager() module_name = "module" @@ -204,21 +232,32 @@ def test_apply_lora_packed_3slice(qkv, n, k, rank, dtype) -> None: lora_b_stacks[1][i][0] = (lora_k.lora_b * lora_k.scaling).T lora_a_stacks[2][i][0] = lora_v.lora_a.T lora_b_stacks[2][i][0] = (lora_v.lora_b * lora_v.scaling).T - + indices_info = [None] * 6 + indices_info[0] = k + indices_info[5] = stage #decoding stage output = torch.zeros(k, sum(qkv), 
device="cuda", dtype=dtype) - _apply_lora_packed_nslice( - input, lora_a_stacks, lora_b_stacks, - torch.randint(0, - lora_a_stacks[0].shape[0], (len(input), ), - device="cuda"), output, (qkv[0], qkv[1], qkv[2])) + _apply_lora_packed_nslice(input, + lora_a_stacks, + lora_b_stacks, + torch.randint(0, + lora_a_stacks[0].shape[0], + (len(input), ), + device="cuda"), + indices_info, + output, (qkv[0], qkv[1], qkv[2]), + cache_clear=True) rtol, atol = TOLERANCES[dtype] assert torch.allclose(expected, output, rtol=rtol, atol=atol) output[:] = 0 - _apply_lora_packed_nslice(input, lora_a_stacks, lora_b_stacks, + _apply_lora_packed_nslice(input, + lora_a_stacks, + lora_b_stacks, torch.full((len(input), ), -1, device="cuda"), - output, (qkv[0], qkv[1], qkv[2])) + indices_info, + output, (qkv[0], qkv[1], qkv[2]), + cache_clear=True) assert torch.allclose(torch.zeros_like(output), output) manager.reset_lora() diff --git a/tests/lora/test_triton_punica.py b/tests/lora/test_triton_punica.py index 9aa210db7073..d4281004a7a2 100644 --- a/tests/lora/test_triton_punica.py +++ b/tests/lora/test_triton_punica.py @@ -324,82 +324,81 @@ def _generate_data_expand_nslices(batchs, hidden_size, lora_nums, max_rank, # ref_out_tensor = ref_out_tensor.to(torch.float32) # assert_close(our_out_tensor, ref_out_tensor) +# @pytest.mark.parametrize("hidden_size", HIDDEN_SIZES) +# @pytest.mark.parametrize("scaling", SCALES) +# @pytest.mark.parametrize("dtype", DTYPES) +# @pytest.mark.parametrize("op_type", OP_TYPES) +# @pytest.mark.parametrize("seed", SEED) +# @pytest.mark.parametrize("device", CUDA_DEVICES) +# def test_triton_sgmv_punica_bgmv( +# hidden_size, +# scaling: float, +# dtype: torch.dtype, +# op_type: str, +# seed: int, +# device: str, +# ): +# # avoid `No suitable kernel. h_in=xx h_out=xxxx ` error +# if dtype == torch.float32 or hidden_size == 3424: +# return +# torch.manual_seed(seed) +# torch.set_default_device(device) +# batchs = 4 # Arbitrary values for testing +# rank = 16 # Arbitrary values for testing +# seq_len = 128 # Arbitrary values for testing +# num_loras = 8 # Arbitrary values for testing +# ( +# inputs_tensor, +# lora_weights, +# our_out_tensor, +# ref_out_tensor, +# b_seq_start_loc, +# lora_indices_tensor, +# seq_len_tensor, +# indices, +# ) = _generate_data(batchs, hidden_size, num_loras, rank, seq_len, dtype, +# op_type, device) -@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES) -@pytest.mark.parametrize("scaling", SCALES) -@pytest.mark.parametrize("dtype", DTYPES) -@pytest.mark.parametrize("op_type", OP_TYPES) -@pytest.mark.parametrize("seed", SEED) -@pytest.mark.parametrize("device", CUDA_DEVICES) -def test_triton_sgmv_punica_bgmv( - hidden_size, - scaling: float, - dtype: torch.dtype, - op_type: str, - seed: int, - device: str, -): - # avoid `No suitable kernel. 
h_in=xx h_out=xxxx ` error - if dtype == torch.float32 or hidden_size == 3424: - return - torch.manual_seed(seed) - torch.set_default_device(device) - batchs = 4 # Arbitrary values for testing - rank = 16 # Arbitrary values for testing - seq_len = 128 # Arbitrary values for testing - num_loras = 8 # Arbitrary values for testing - ( - inputs_tensor, - lora_weights, - our_out_tensor, - ref_out_tensor, - b_seq_start_loc, - lora_indices_tensor, - seq_len_tensor, - indices, - ) = _generate_data(batchs, hidden_size, num_loras, rank, seq_len, dtype, - op_type, device) - - max_seq_length = seq_len_tensor.max() - if isinstance(max_seq_length, tuple): - max_seq_length = max_seq_length[0].item() - else: - max_seq_length = max_seq_length.item() - if op_type == "shrink": - sgmv_shrink( - inputs_tensor, - lora_weights, - our_out_tensor, - b_seq_start_loc, - seq_len_tensor, - lora_indices_tensor, - batchs, - max_seq_length, - scaling, - ) - else: - sgmv_expand( - inputs_tensor, - lora_weights, - our_out_tensor, - b_seq_start_loc, - seq_len_tensor, - lora_indices_tensor, - batchs, - max_seq_length, - add_inputs=True, - ) - lora_weights_4d = lora_weights.unsqueeze(dim=1) - _punica_bgmv( - ref_out_tensor, - inputs_tensor, - lora_weights_4d, - indices, - scaling if op_type == "shrink" else 1.0, - ) - if op_type == "shrink": - ref_out_tensor = ref_out_tensor.to(torch.float32) - assert_close(our_out_tensor, ref_out_tensor) +# max_seq_length = seq_len_tensor.max() +# if isinstance(max_seq_length, tuple): +# max_seq_length = max_seq_length[0].item() +# else: +# max_seq_length = max_seq_length.item() +# if op_type == "shrink": +# sgmv_shrink( +# inputs_tensor, +# lora_weights, +# our_out_tensor, +# b_seq_start_loc, +# seq_len_tensor, +# lora_indices_tensor, +# batchs, +# max_seq_length, +# scaling, +# ) +# else: +# sgmv_expand( +# inputs_tensor, +# lora_weights, +# our_out_tensor, +# b_seq_start_loc, +# seq_len_tensor, +# lora_indices_tensor, +# batchs, +# max_seq_length, +# add_inputs=True, +# ) +# lora_weights_4d = lora_weights.unsqueeze(dim=1) +# _punica_bgmv( +# ref_out_tensor, +# inputs_tensor, +# lora_weights_4d, +# indices, +# scaling if op_type == "shrink" else 1.0, +# ) +# if op_type == "shrink": +# ref_out_tensor = ref_out_tensor.to(torch.float32) +# assert_close(our_out_tensor, ref_out_tensor) @pytest.mark.parametrize("batchs", BATCHS) diff --git a/vllm/lora/fully_sharded_layers.py b/vllm/lora/fully_sharded_layers.py index e405d06ef695..76544e1d51ca 100644 --- a/vllm/lora/fully_sharded_layers.py +++ b/vllm/lora/fully_sharded_layers.py @@ -18,9 +18,9 @@ RowParallelLinearWithLoRA, ) from vllm.lora.punica import ( - add_shrink_triton, - add_expand_triton, - add_expand_slice_triton, + add_shrink, + add_expand, + add_expand_slice, ) if TYPE_CHECKING: @@ -71,18 +71,9 @@ def apply(self, x: torch.Tensor, dtype=torch.float32, device=x.device, ) - - # bgmv( - # buffer, - # x, - # self.lora_a_stacked, - # self.indices[: self.indices_len[0]], - # 0, - # 1.0, - # ) token_num = self.indices_len[0] - is_prefilling = bool(self.indices_len[4]) - add_shrink_triton( + is_prefilling = bool(self.indices_len[5]) + add_shrink( buffer, x, self.lora_a_stacked, @@ -92,15 +83,7 @@ def apply(self, x: torch.Tensor, is_prefilling, ) buffer = tensor_model_parallel_all_gather(buffer) - # bgmv( - # output, - # buffer, - # self.lora_b_stacked, - # self.indices[: self.indices_len[0]], - # 0, - # 1.0, - # ) - add_expand_triton( + add_expand( output, buffer, self.lora_b_stacked, @@ -110,7 +93,6 @@ def apply(self, x: torch.Tensor, 
add_input=True, ) # now have column partitioned output - output = output.view(*out_orig_shape) return output @@ -138,7 +120,7 @@ def _mcp_apply(x, bias, layer): MergedColumnParallelLinearWithShardedLoRA and QKVParallelLinearWithShardedLora share the same LoRa weight application method. - + The main difference is the step by shard_size for lora_b which can vary for QKVParallelLinearWithShardedLora but is constant for MergedColumnParallelLinearWithShardedLoRA. @@ -155,18 +137,10 @@ def _mcp_apply(x, bias, layer): device=x.device, ) token_num = layer.indices_len[0] - is_prefilling = bool(layer.indices_len[4]) + is_prefilling = bool(layer.indices_len[5]) for idx in range(n): - # bgmv( - # buffers[idx], - # x, - # layer.lora_a_stacked[idx], - # layer.indices[: layer.indices_len[0]], - # 0, - # 1.0, - # ) - - add_shrink_triton( + + add_shrink( buffers[idx], x, layer.lora_a_stacked[idx], @@ -180,17 +154,7 @@ def _mcp_apply(x, bias, layer): left_offset = 0 for idx in range(n): shard_size = layer.lora_b_stacked[idx].shape[2] - # dispatch_bgmv_low_level( - # output, - # buffers[idx], - # layer.lora_b_stacked[idx], - # layer.indices[: layer.indices_len[0]], - # 0, - # 1.0, - # left_offset, - # shard_size, - # ) - add_expand_slice_triton( + add_expand_slice( output, buffers[idx], layer.lora_b_stacked[idx], @@ -328,17 +292,9 @@ def apply(self, x: torch.Tensor) -> torch.Tensor: dtype=torch.float32, device=x.device, ) - # bgmv( - # buffer, - # x, - # self.lora_a_stacked, - # self.indices[: self.indices_len[0]], - # 0, - # 1.0, - # ) token_num = self.indices_len[0] - is_prefilling = bool(self.indices_len[4]) - add_shrink_triton( + is_prefilling = bool(self.indices_len[5]) + add_shrink( buffer, x, self.lora_a_stacked, @@ -357,17 +313,7 @@ def apply(self, x: torch.Tensor) -> torch.Tensor: # reduced before being used shard_size = self.lora_b_stacked.shape[2] start_idx = self.tp_rank * shard_size - # dispatch_bgmv_low_level( - # output, - # buffer, - # self.lora_b_stacked, - # self.indices[: self.indices_len[0]], - # 0, - # 1.0, - # start_idx, - # shard_size, - # ) - add_expand_slice_triton( + add_expand_slice( output, buffer, self.lora_b_stacked, diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py index abab47f34fdc..35c974fb6d5f 100644 --- a/vllm/lora/layers.py +++ b/vllm/lora/layers.py @@ -21,8 +21,8 @@ # from vllm.lora.ops.sgmv_expand import sgmv_expand from vllm.lora.punica import ( - add_lora_triton, - add_expand_triton, + add_lora, + add_expand, ) from vllm.model_executor.layers.linear import ( ColumnParallelLinear, @@ -73,7 +73,7 @@ def dec(*args, **kwargs): return dec -def _apply_expand_triton( +def _apply_expand( x: torch.Tensor, lora_b_stacked: torch.Tensor, lora_index_tensor: torch.Tensor, @@ -86,7 +86,7 @@ def _apply_expand_triton( output = output.view(-1, output.shape[-1]) token_num = indices_info[0] is_prefilling = bool(indices_info[5]) - add_expand_triton( + add_expand( output, x, lora_b_stacked, @@ -98,15 +98,14 @@ def _apply_expand_triton( return output.view_as(org_output) -def _apply_lora_triton( - x: torch.Tensor, - lora_a_stacked: torch.Tensor, - lora_b_stacked: torch.Tensor, - lora_index_tensor: torch.Tensor, - indices_info: List[int], - output: torch.Tensor, -) -> torch.Tensor: - """Applies lora to each input. 
This method applies all loras to each +def _apply_lora(x: torch.Tensor, + lora_a_stacked: torch.Tensor, + lora_b_stacked: torch.Tensor, + lora_index_tensor: torch.Tensor, + indices_info: List[int], + output: torch.Tensor, + cache_clear: bool = False) -> torch.Tensor: + """Applies lora to each input. This method applies all loras to each input. It uses the `lora_index_tensor` vector to determine which lora yields the correct output. An index of -1 means no lora should be applied. This method adds the final lora results to the output. @@ -117,9 +116,9 @@ def _apply_lora_triton( lora_b_stacked (torch.Tensor): (num_loras, output_dim, lora_rank) lora_index_tensor (torch.Tensor): (batch_size*seq_number,). The LoRA index corresponding to each token - indices_info: List[int]: 5 is the number of indicies tensors. - # base_indices, sampler_indices, sampler_indices_padded, - # embeddings_indices,prefilling or decoding + indices_len(List):(6,), It contains (base_indices, sampler_indices, + sampler_indices_padded,embeddings_indices, long_lora_indices, + prefilling flag). output (torch.Tensor): (batch_size, output_dim) Returns: @@ -131,42 +130,34 @@ def _apply_lora_triton( output = output.view(-1, output.shape[-1]) token_num = indices_info[0] + is_prefilling = bool(indices_info[5]) - add_lora_triton( - output, - x, - lora_a_stacked, - lora_b_stacked, - lora_index_tensor[:token_num], - 0, - 1.0, - is_prefilling, - ) + add_lora(output, + x, + lora_a_stacked, + lora_b_stacked, + lora_index_tensor[:token_num], + 0, + 1.0, + is_prefilling, + cache_clear=cache_clear) return output.view_as(org_output) -def _apply_lora_triton_nslice( - x: torch.Tensor, - lora_a_stacked: Tuple[torch.Tensor, torch.Tensor, torch.Tensor], - lora_b_stacked: Tuple[torch.Tensor, torch.Tensor, torch.Tensor], - lora_index_tensor: torch.Tensor, - indices_info: List[int], - output: torch.Tensor, - output_slices: Tuple[int, ...], -) -> torch.Tensor: - """_summary_ - - Args: - x (torch.Tensor): _description_ - lora_a_stacked (Tuple[torch.Tensor, torch.Tensor, torch.Tensor]): _description_ - lora_b_stacked (Tuple[torch.Tensor, torch.Tensor, torch.Tensor]): _description_ - lora_index_tensor (torch.Tensor): _description_ - indices_info (List[int]): _description_ - output (torch.Tensor): _description_ - output_slices (Tuple[int, ...]): _description_ - - Returns: - torch.Tensor: _description_ +def _apply_lora_packed_nslice(x: torch.Tensor, + lora_a_stacked: Tuple[torch.Tensor, torch.Tensor, + torch.Tensor], + lora_b_stacked: Tuple[torch.Tensor, torch.Tensor, + torch.Tensor], + lora_index_tensor: torch.Tensor, + indices_info: List[int], + output: torch.Tensor, + output_slices: Tuple[int, ...], + cache_clear: bool = False) -> torch.Tensor: + """ + Applies lora to each input. Similar to _apply_lora, This method is + used for layers that are composed of multiple sublayers + (slices) packed together. 
""" org_output = output x = x.view(-1, x.shape[-1]) @@ -177,18 +168,17 @@ def _apply_lora_triton_nslice( offset_left = 0 # TODO fuse these kernels for slice_idx in range(len(output_slices)): - add_lora_triton( - output, - x, - lora_a_stacked[slice_idx], - lora_b_stacked[slice_idx], - lora_index_tensor[:token_num], - 0, - 1.0, - is_prefilling, - offset_left, - output_slices[slice_idx], - ) + add_lora(output, + x, + lora_a_stacked[slice_idx], + lora_b_stacked[slice_idx], + lora_index_tensor[:token_num], + 0, + 1.0, + is_prefilling, + offset_left, + output_slices[slice_idx], + cache_clear=cache_clear) offset_left += output_slices[slice_idx] return output.view_as(org_output) @@ -407,7 +397,7 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: full_lora_a_embeddings.shape[1], -1, ) - _apply_expand_triton( + _apply_expand( full_lora_a_embeddings, self.lora_b_stacked, self.indices, @@ -526,7 +516,7 @@ def set_mapping( def apply(self, x: torch.Tensor, bias: Optional[torch.Tensor]) -> torch.Tensor: output = self.base_layer.quant_method.apply(self.base_layer, x, bias) - _apply_lora_triton( + _apply_lora( x, self.lora_a_stacked, self.lora_b_stacked, @@ -687,7 +677,7 @@ def set_lora( def apply(self, x: torch.Tensor, bias: Optional[torch.Tensor]) -> torch.Tensor: output = self.base_layer.quant_method.apply(self.base_layer, x, bias) - _apply_lora_triton_nslice( + _apply_lora_packed_nslice( x, self.lora_a_stacked, self.lora_b_stacked, @@ -957,7 +947,7 @@ def set_lora( def apply(self, x: torch.Tensor, bias: Optional[torch.Tensor]) -> torch.Tensor: output = self.base_layer.quant_method.apply(self.base_layer, x, bias) - _apply_lora_triton_nslice( + _apply_lora_packed_nslice( x, self.lora_a_stacked, self.lora_b_stacked, @@ -1078,7 +1068,7 @@ def set_mapping( def apply(self, x: torch.Tensor) -> torch.Tensor: output = self.base_layer.quant_method.apply(self.base_layer, x) # maybe we need not restrict range to [:batch_size] - _apply_lora_triton( + _apply_lora( x, self.lora_a_stacked, self.lora_b_stacked, @@ -1301,7 +1291,7 @@ def _get_logits( # sampler_indices sampler_indices = self.indices_len[1] is_prefilling = False - add_lora_triton( + add_lora( logits, hidden_states, self.lora_a_stacked, diff --git a/vllm/lora/models.py b/vllm/lora/models.py index 4cb977a25de1..b7923ce4de8e 100644 --- a/vllm/lora/models.py +++ b/vllm/lora/models.py @@ -84,10 +84,10 @@ def convert_mapping( long_lora_indices: Tensor of shape [batch_size] mapping requests to RoPE offsets and rot dims for long LoRAs. None if long context lora doesn't exist. - indices_len: List of lengths of the above tensors. - Used to index into each tensor. It contains length for + indices_len: List of lengths of the above tensors and prefilling + flag.Used to index into each tensor. It contains (base_indices, sampler_indices, sampler_indices_padded, - embeddings_indices, long_lora_indices,prefilling stage flag). + embeddings_indices, long_lora_indices,prefilling flag). 
""" index_mapping_indices: List[int] = list(mapping.index_mapping).copy() embedding_indices = index_mapping_indices.copy() @@ -153,8 +153,10 @@ def convert_mapping( if long_lora_indices_len is not None: indices_len.append(long_lora_indices_len) else: - #If long_lora doesn'texist,append None + #If long_lora doesn't exist,append None indices_len.append(None) + # Append a prefilling flag to help selecting the appropriate lora + # ops (sgmv or bgmv) indices_len.append(int(mapping.is_prefilling)) return ( base_indices, diff --git a/vllm/lora/ops/bgmv_expand.py b/vllm/lora/ops/bgmv_expand.py index b977540cbfb4..ec68c6d20f98 100644 --- a/vllm/lora/ops/bgmv_expand.py +++ b/vllm/lora/ops/bgmv_expand.py @@ -103,8 +103,10 @@ def bgmv_expand( lora_indices_tensor (torch.Tensor): (batch_size,). The LoRA index corresponding to each batch batchs (int): batch size - add_inputs (bool, optional): _description_. Defaults to False. - cast_type (bool, optional): _description_. Defaults to False. + add_inputs (bool, optional): Defaults to False. adds the final lora + results to the output. + override_config (Optional[Dict[str, int]], optional): Defaults to None. + Triton grid config """ assert inputs.dtype in [torch.float16, torch.bfloat16, torch.float32] diff --git a/vllm/lora/ops/bgmv_expand_slice.py b/vllm/lora/ops/bgmv_expand_slice.py index c741d10e9c9d..af343d6eae1c 100644 --- a/vllm/lora/ops/bgmv_expand_slice.py +++ b/vllm/lora/ops/bgmv_expand_slice.py @@ -109,7 +109,9 @@ def bgmv_expand_slice( slice_offst (int): output_tensor's offst slice_size (int): current output_tensor's size batchs (int): batch size - add_inputs (bool, optional): _description_. Defaults to False. + add_inputs (bool, optional): Defaults to False. + override_config (Optional[Dict[str, int]], optional): Defaults to None. + Triton grid config """ assert inputs.dtype in [torch.float16, torch.bfloat16, torch.float32] diff --git a/vllm/lora/ops/bgmv_shrink.py b/vllm/lora/ops/bgmv_shrink.py index a7087a96488f..6b92ed72c4c2 100644 --- a/vllm/lora/ops/bgmv_shrink.py +++ b/vllm/lora/ops/bgmv_shrink.py @@ -97,6 +97,8 @@ def bgmv_shrink( corresponding to each batch batchs (int): batch size scaling (float): Scaling factor. + override_config (Optional[Dict[str, int]], optional): Defaults to None. + Triton grid config """ assert inputs.dtype == lora_a_weights.dtype assert inputs.dtype in [torch.float16, torch.bfloat16] diff --git a/vllm/lora/ops/sgmv_expand.py b/vllm/lora/ops/sgmv_expand.py index f34eec0357bd..879184db0b8b 100644 --- a/vllm/lora/ops/sgmv_expand.py +++ b/vllm/lora/ops/sgmv_expand.py @@ -122,8 +122,8 @@ def sgmv_expand( batchs (int): batch size max_seq_length (int): The max sequence lengths of the sequences in the batch - add_inputs (bool, optional): _description_. Defaults to False. - cast_type (bool, optional): _description_. Defaults to False. + add_inputs (bool, optional): Defaults to False. adds the final lora + results to the output. """ assert inputs.dtype in [torch.float16, torch.bfloat16, torch.float32] diff --git a/vllm/lora/ops/sgmv_expand_slice.py b/vllm/lora/ops/sgmv_expand_slice.py index 2fdedd591032..000fef304823 100644 --- a/vllm/lora/ops/sgmv_expand_slice.py +++ b/vllm/lora/ops/sgmv_expand_slice.py @@ -133,7 +133,8 @@ def sgmv_expand_slice( in the batch slice_offst (int): output_tensor's offst slice_size (int): current output_tensor's size - add_inputs (bool, optional): _description_. Defaults to False. + add_inputs (bool, optional): Defaults to False. adds the final lora + results to the output.. 
""" assert inputs.dtype in [torch.float16, torch.bfloat16, torch.float32] diff --git a/vllm/lora/ops/utils.py b/vllm/lora/ops/utils.py index 6124916cfd9d..e08b4409af75 100644 --- a/vllm/lora/ops/utils.py +++ b/vllm/lora/ops/utils.py @@ -1,6 +1,7 @@ import functools import json import os +import torch from typing import Dict @@ -9,8 +10,7 @@ def _get_config_file_name( batchs: int, hidden_size: int, ) -> str: - # device_name = torch.cuda.get_device_name().replace(" ", "_") - device_name = "NVIDIA_GeForce_RTX_3090" + device_name = torch.cuda.get_device_name().replace(" ", "_") return (f"op_type={op_type},batchs={batchs},hidden_size={hidden_size} " + f"device_name={device_name}.json") diff --git a/vllm/lora/punica.py b/vllm/lora/punica.py index ec4366acf456..695fd7446945 100644 --- a/vllm/lora/punica.py +++ b/vllm/lora/punica.py @@ -12,7 +12,12 @@ _PARAMS_CACHE: Dict[int, Tuple] = {} -def _compute_params(token_lora_tensor: torch.Tensor): +def _compute_params( + token_lora_tensor: torch.Tensor +) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, int, int, ]: + """ + Get the information required for the sgmv kernel. + """ pointer = token_lora_tensor.data_ptr() if pointer not in _PARAMS_CACHE: lora_indices_tensor, seq_length_tensor = torch.unique_consecutive( @@ -36,6 +41,7 @@ def reset_params_cache(): """At the beginning of the prefilling stage, we need clear the cache explicitly """ + #TODO release gpu memory _PARAMS_CACHE.clear() @@ -46,7 +52,7 @@ def _get_prefilling_params(token_lora_tensor: torch.Tensor, return _compute_params(token_lora_tensor) -def add_shrink_triton( +def add_shrink( y: torch.Tensor, x: torch.Tensor, w_t_all: torch.Tensor, @@ -56,6 +62,10 @@ def add_shrink_triton( is_prefilling: bool, cache_clear: bool = False, ): + """ + y=x@w_t_all + When `is_prefilling` is True, will lanuch `sgmv_shrink` + """ if is_prefilling: ( b_seq_start_tensor, @@ -79,7 +89,7 @@ def add_shrink_triton( bgmv_shrink(x, w_t_all, y, lora_indices_tensor, scale) -def add_expand_triton( +def add_expand( y: torch.Tensor, x: torch.Tensor, w_t_all: torch.Tensor, @@ -89,6 +99,10 @@ def add_expand_triton( add_input: bool = True, cache_clear: bool = False, ): + """ + y+=x@w_t_all + When `is_prefilling` is True, will lanuch `sgmv_expand`, + """ if is_prefilling: ( b_seq_start_tensor, @@ -112,7 +126,7 @@ def add_expand_triton( bgmv_expand(x, w_t_all, y, lora_indices_tensor, add_inputs=add_input) -def add_expand_slice_triton( +def add_expand_slice( y: torch.Tensor, x: torch.Tensor, w_t_all: torch.Tensor, @@ -124,6 +138,9 @@ def add_expand_slice_triton( add_input: bool = True, cache_clear: bool = False, ): + """ + y+=x@w_t_all + """ if is_prefilling: ( b_seq_start_tensor, @@ -157,7 +174,7 @@ def add_expand_slice_triton( ) -def add_lora_triton( +def add_lora( y: torch.Tensor, x: torch.Tensor, wa_t_all: torch.Tensor, @@ -173,9 +190,29 @@ def add_lora_triton( cache_clear: bool = False, ): """ - Same as `add_lora_triton` but you can operate on slices of y. - Pass whole y, define y_offset and y_slice_size. + Semantics: + y[i] += ( + x[i].unsqueeze(0) + @ wa_t_all[indices[i], layer_idx, :, :].transpose(-1, -2) + @ wb_t_all[indices[i], layer_idx, :, :].transpose(-1, -2) + * scale + ).squeeze(0) + Args: + y (torch.Tensor): Output tensor. Will be changed in-place. + x (torch.Tensor): Input tensor + wa_t_all (torch.Tensor): lora_a's weight + wb_t_all (torch.Tensor): lora_b's weight + lora_indices_tensor (torch.Tensor): _description_ + layer_idx (int): Layer index of LoRA weights. 
+ scale (float): Scaling factor. + is_prefilling (bool): prefiling stage + y_offset (Optional[int], optional): Offset to apply to the starting + column of y. + y_slice_size (Optional[int], optional): Size of the y column slice.. + buffer (Optional[torch.Tensor], optional): Defaults to None. + cache_clear (bool, optional): Defaults to False. """ + r = wb_t_all.size(-1) if buffer is None: # We set the buffer to be float32 by default ,refer to: @@ -184,7 +221,7 @@ def add_lora_triton( dtype=torch.float32, device=x.device) - add_shrink_triton( + add_shrink( buffer, x, wa_t_all, @@ -195,7 +232,7 @@ def add_lora_triton( cache_clear=cache_clear, ) if y_offset is None and y_slice_size is None: - add_expand_triton( + add_expand( y, buffer, wb_t_all, @@ -206,7 +243,7 @@ def add_lora_triton( cache_clear=cache_clear, ) else: - add_expand_slice_triton( + add_expand_slice( y, buffer, wb_t_all, diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index 34fbfa8e33ef..3b1a846f0d1b 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -93,6 +93,7 @@ def apply(self, if bias is not None: return F.linear(x, weight) + bias return F.linear(x, weight) + return F.linear(x, weight, bias) From 348c4a4ea1c32efebd0f915f7cdb4abb41293d93 Mon Sep 17 00:00:00 2001 From: jeejeeli Date: Thu, 20 Jun 2024 13:48:10 +0800 Subject: [PATCH 29/71] reformat --- tests/lora/test_lora.py | 2 +- tests/lora/test_triton_punica.py | 149 ++++++++++++++------------- vllm/lora/fully_sharded_layers.py | 20 ++-- vllm/lora/layers.py | 38 +++---- vllm/lora/models.py | 19 ++-- vllm/lora/ops/utils.py | 3 +- vllm/lora/punica.py | 12 ++- vllm/model_executor/layers/linear.py | 1 - 8 files changed, 111 insertions(+), 133 deletions(-) diff --git a/tests/lora/test_lora.py b/tests/lora/test_lora.py index 4bc959b826bb..51708c8fa6e5 100644 --- a/tests/lora/test_lora.py +++ b/tests/lora/test_lora.py @@ -107,7 +107,7 @@ def test_apply_lora_packed_2slice(m, n, k, rank, dtype, stage) -> None: lora_1 = manager.get_module_lora(module_name + "1") manager.init_random_lora(module_name + "2", weight, rank=rank) lora_2 = manager.get_module_lora(module_name + "2") - + input = torch.rand(k, n, device="cuda", dtype=dtype) expected = torch.cat([ input @ lora_1.lora_a @ lora_1.lora_b * lora_1.scaling, diff --git a/tests/lora/test_triton_punica.py b/tests/lora/test_triton_punica.py index d4281004a7a2..9aa210db7073 100644 --- a/tests/lora/test_triton_punica.py +++ b/tests/lora/test_triton_punica.py @@ -324,81 +324,82 @@ def _generate_data_expand_nslices(batchs, hidden_size, lora_nums, max_rank, # ref_out_tensor = ref_out_tensor.to(torch.float32) # assert_close(our_out_tensor, ref_out_tensor) -# @pytest.mark.parametrize("hidden_size", HIDDEN_SIZES) -# @pytest.mark.parametrize("scaling", SCALES) -# @pytest.mark.parametrize("dtype", DTYPES) -# @pytest.mark.parametrize("op_type", OP_TYPES) -# @pytest.mark.parametrize("seed", SEED) -# @pytest.mark.parametrize("device", CUDA_DEVICES) -# def test_triton_sgmv_punica_bgmv( -# hidden_size, -# scaling: float, -# dtype: torch.dtype, -# op_type: str, -# seed: int, -# device: str, -# ): -# # avoid `No suitable kernel. 
h_in=xx h_out=xxxx ` error -# if dtype == torch.float32 or hidden_size == 3424: -# return -# torch.manual_seed(seed) -# torch.set_default_device(device) -# batchs = 4 # Arbitrary values for testing -# rank = 16 # Arbitrary values for testing -# seq_len = 128 # Arbitrary values for testing -# num_loras = 8 # Arbitrary values for testing -# ( -# inputs_tensor, -# lora_weights, -# our_out_tensor, -# ref_out_tensor, -# b_seq_start_loc, -# lora_indices_tensor, -# seq_len_tensor, -# indices, -# ) = _generate_data(batchs, hidden_size, num_loras, rank, seq_len, dtype, -# op_type, device) -# max_seq_length = seq_len_tensor.max() -# if isinstance(max_seq_length, tuple): -# max_seq_length = max_seq_length[0].item() -# else: -# max_seq_length = max_seq_length.item() -# if op_type == "shrink": -# sgmv_shrink( -# inputs_tensor, -# lora_weights, -# our_out_tensor, -# b_seq_start_loc, -# seq_len_tensor, -# lora_indices_tensor, -# batchs, -# max_seq_length, -# scaling, -# ) -# else: -# sgmv_expand( -# inputs_tensor, -# lora_weights, -# our_out_tensor, -# b_seq_start_loc, -# seq_len_tensor, -# lora_indices_tensor, -# batchs, -# max_seq_length, -# add_inputs=True, -# ) -# lora_weights_4d = lora_weights.unsqueeze(dim=1) -# _punica_bgmv( -# ref_out_tensor, -# inputs_tensor, -# lora_weights_4d, -# indices, -# scaling if op_type == "shrink" else 1.0, -# ) -# if op_type == "shrink": -# ref_out_tensor = ref_out_tensor.to(torch.float32) -# assert_close(our_out_tensor, ref_out_tensor) +@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES) +@pytest.mark.parametrize("scaling", SCALES) +@pytest.mark.parametrize("dtype", DTYPES) +@pytest.mark.parametrize("op_type", OP_TYPES) +@pytest.mark.parametrize("seed", SEED) +@pytest.mark.parametrize("device", CUDA_DEVICES) +def test_triton_sgmv_punica_bgmv( + hidden_size, + scaling: float, + dtype: torch.dtype, + op_type: str, + seed: int, + device: str, +): + # avoid `No suitable kernel. 
h_in=xx h_out=xxxx ` error + if dtype == torch.float32 or hidden_size == 3424: + return + torch.manual_seed(seed) + torch.set_default_device(device) + batchs = 4 # Arbitrary values for testing + rank = 16 # Arbitrary values for testing + seq_len = 128 # Arbitrary values for testing + num_loras = 8 # Arbitrary values for testing + ( + inputs_tensor, + lora_weights, + our_out_tensor, + ref_out_tensor, + b_seq_start_loc, + lora_indices_tensor, + seq_len_tensor, + indices, + ) = _generate_data(batchs, hidden_size, num_loras, rank, seq_len, dtype, + op_type, device) + + max_seq_length = seq_len_tensor.max() + if isinstance(max_seq_length, tuple): + max_seq_length = max_seq_length[0].item() + else: + max_seq_length = max_seq_length.item() + if op_type == "shrink": + sgmv_shrink( + inputs_tensor, + lora_weights, + our_out_tensor, + b_seq_start_loc, + seq_len_tensor, + lora_indices_tensor, + batchs, + max_seq_length, + scaling, + ) + else: + sgmv_expand( + inputs_tensor, + lora_weights, + our_out_tensor, + b_seq_start_loc, + seq_len_tensor, + lora_indices_tensor, + batchs, + max_seq_length, + add_inputs=True, + ) + lora_weights_4d = lora_weights.unsqueeze(dim=1) + _punica_bgmv( + ref_out_tensor, + inputs_tensor, + lora_weights_4d, + indices, + scaling if op_type == "shrink" else 1.0, + ) + if op_type == "shrink": + ref_out_tensor = ref_out_tensor.to(torch.float32) + assert_close(our_out_tensor, ref_out_tensor) @pytest.mark.parametrize("batchs", BATCHS) diff --git a/vllm/lora/fully_sharded_layers.py b/vllm/lora/fully_sharded_layers.py index 76544e1d51ca..a2cb031c8ca9 100644 --- a/vllm/lora/fully_sharded_layers.py +++ b/vllm/lora/fully_sharded_layers.py @@ -7,21 +7,13 @@ from vllm.config import LoRAConfig from vllm.distributed.communication_op import ( - tensor_model_parallel_all_gather, - tensor_model_parallel_all_reduce, -) + tensor_model_parallel_all_gather, tensor_model_parallel_all_reduce) from vllm.distributed.parallel_state import get_tensor_model_parallel_rank -from vllm.lora.layers import ( - ColumnParallelLinearWithLoRA, - MergedColumnParallelLinearWithLoRA, - MergedQKVParallelLinearWithLora, - RowParallelLinearWithLoRA, -) -from vllm.lora.punica import ( - add_shrink, - add_expand, - add_expand_slice, -) +from vllm.lora.layers import (ColumnParallelLinearWithLoRA, + MergedColumnParallelLinearWithLoRA, + MergedQKVParallelLinearWithLora, + RowParallelLinearWithLoRA) +from vllm.lora.punica import add_expand, add_expand_slice, add_shrink if TYPE_CHECKING: pass diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py index 35c974fb6d5f..80bd2dd765ea 100644 --- a/vllm/lora/layers.py +++ b/vllm/lora/layers.py @@ -1,6 +1,6 @@ # pylint: disable=unused-argument import math -from dataclasses import dataclass, field +from dataclasses import dataclass from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Union import torch @@ -9,34 +9,24 @@ from transformers import PretrainedConfig from vllm.config import LoRAConfig -from vllm.distributed import ( - get_tensor_model_parallel_rank, - get_tensor_model_parallel_world_size, - split_tensor_along_last_dim, - tensor_model_parallel_all_gather, - tensor_model_parallel_all_reduce, - tensor_model_parallel_gather, -) +from vllm.distributed import (get_tensor_model_parallel_rank, + get_tensor_model_parallel_world_size, + split_tensor_along_last_dim, + tensor_model_parallel_all_gather, + tensor_model_parallel_all_reduce, + tensor_model_parallel_gather) from vllm.distributed.utils import divide - # from vllm.lora.ops.sgmv_expand import sgmv_expand -from 
vllm.lora.punica import ( - add_lora, - add_expand, -) -from vllm.model_executor.layers.linear import ( - ColumnParallelLinear, - MergedColumnParallelLinear, - QKVParallelLinear, - RowParallelLinear, -) +from vllm.lora.punica import add_expand, add_lora +from vllm.model_executor.layers.linear import (ColumnParallelLinear, + MergedColumnParallelLinear, + QKVParallelLinear, + RowParallelLinear) from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.rotary_embedding import ( - LinearScalingRotaryEmbedding, - RotaryEmbedding, -) + LinearScalingRotaryEmbedding, RotaryEmbedding) from vllm.model_executor.layers.vocab_parallel_embedding import ( - VocabParallelEmbedding, ) + VocabParallelEmbedding) if TYPE_CHECKING: pass diff --git a/vllm/lora/models.py b/vllm/lora/models.py index b7923ce4de8e..d34725523c9c 100644 --- a/vllm/lora/models.py +++ b/vllm/lora/models.py @@ -12,22 +12,15 @@ from vllm.config import LoRAConfig from vllm.logger import init_logger -from vllm.lora.layers import ( - BaseLayerWithLoRA, - LinearScalingRotaryEmbeddingWithLora, - LoRAMapping, -) +from vllm.lora import punica +from vllm.lora.layers import (BaseLayerWithLoRA, + LinearScalingRotaryEmbeddingWithLora, + LoRAMapping) from vllm.lora.lora import LoRALayerWeights, PackedLoRALayerWeights -from vllm.lora.utils import ( - from_layer, - from_layer_logits_processor, - parse_fine_tuned_lora_name, - replace_submodule, -) +from vllm.lora.utils import (from_layer, from_layer_logits_processor, + parse_fine_tuned_lora_name, replace_submodule) from vllm.utils import LRUCache, is_pin_memory_available -from vllm.lora import punica - logger = init_logger(__name__) _GLOBAL_LORA_ID = 0 diff --git a/vllm/lora/ops/utils.py b/vllm/lora/ops/utils.py index e08b4409af75..980dc8c6693f 100644 --- a/vllm/lora/ops/utils.py +++ b/vllm/lora/ops/utils.py @@ -1,9 +1,10 @@ import functools import json import os -import torch from typing import Dict +import torch + def _get_config_file_name( op_type: str, diff --git a/vllm/lora/punica.py b/vllm/lora/punica.py index 695fd7446945..321fccc9df93 100644 --- a/vllm/lora/punica.py +++ b/vllm/lora/punica.py @@ -1,7 +1,9 @@ # Based on code from https://github.com/punica-ai/punica -from typing import Optional, Dict, Tuple +from typing import Dict, Optional, Tuple + import torch + from vllm.lora.ops.bgmv_expand import bgmv_expand from vllm.lora.ops.bgmv_expand_slice import bgmv_expand_slice from vllm.lora.ops.bgmv_shrink import bgmv_shrink @@ -64,7 +66,7 @@ def add_shrink( ): """ y=x@w_t_all - When `is_prefilling` is True, will lanuch `sgmv_shrink` + When `is_prefilling` is True, will launch `sgmv_shrink` """ if is_prefilling: ( @@ -101,7 +103,7 @@ def add_expand( ): """ y+=x@w_t_all - When `is_prefilling` is True, will lanuch `sgmv_expand`, + When `is_prefilling` is True, will launch `sgmv_expand`, """ if is_prefilling: ( @@ -133,8 +135,8 @@ def add_expand_slice( lora_indices_tensor: torch.Tensor, layer_idx: int, is_prefilling: bool, - y_offset: int, - y_slice_size: int, + y_offset: Optional[int], + y_slice_size: Optional[int], add_input: bool = True, cache_clear: bool = False, ): diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index 3b1a846f0d1b..34fbfa8e33ef 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -93,7 +93,6 @@ def apply(self, if bias is not None: return F.linear(x, weight) + bias return F.linear(x, weight) - return F.linear(x, weight, bias) From 
fa27688239e2160cdd8cfffbc7eb793fe26a906a Mon Sep 17 00:00:00 2001 From: jeejeeli Date: Fri, 21 Jun 2024 01:22:46 +0800 Subject: [PATCH 30/71] update --- vllm/lora/layers.py | 22 ----------- vllm/lora/punica.py | 89 --------------------------------------------- 2 files changed, 111 deletions(-) diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py index 00bd6278bb0e..a4deff6c221f 100644 --- a/vllm/lora/layers.py +++ b/vllm/lora/layers.py @@ -258,27 +258,6 @@ def __init__(self, base_layer: VocabParallelEmbedding) -> None: self.embeddings_weights: Optional[torch.Tensor] def create_lora_weights( -<<<<<<< HEAD - self, - max_loras: int, - lora_config: LoRAConfig, - model_config: Optional[PretrainedConfig] = None, - ) -> None: - lora_vocab_start_idx = self.base_layer.org_vocab_size - weights_idx = None - if self.base_layer.vocab_end_index > lora_vocab_start_idx: - # We can start adding lora weights - weights_idx = max( - lora_vocab_start_idx - self.base_layer.vocab_start_index, 0) - self.embeddings_slice = ( - self.base_layer.vocab_start_index - - self.base_layer.org_vocab_size + weights_idx, - self.base_layer.vocab_end_index - - self.base_layer.org_vocab_size, - ) - self.embeddings_weights = self.base_layer.weight.data[weights_idx:] - self.embeddings_weights.fill_(0) -======= self, max_loras: int, lora_config: LoRAConfig, @@ -297,7 +276,6 @@ def create_lora_weights( self.base_layer.org_vocab_size) self.base_layer.weight.data[ self.base_layer.num_org_embeddings_per_partition:].fill_(0) ->>>>>>> main else: self.embeddings_slice = None self.embeddings_weights = None diff --git a/vllm/lora/punica.py b/vllm/lora/punica.py index a1418d23c5e3..321fccc9df93 100644 --- a/vllm/lora/punica.py +++ b/vllm/lora/punica.py @@ -4,7 +4,6 @@ import torch -<<<<<<< HEAD from vllm.lora.ops.bgmv_expand import bgmv_expand from vllm.lora.ops.bgmv_expand_slice import bgmv_expand_slice from vllm.lora.ops.bgmv_shrink import bgmv_shrink @@ -47,23 +46,6 @@ def reset_params_cache(): #TODO release gpu memory _PARAMS_CACHE.clear() -======= -from vllm import _custom_ops as ops - - -def _check_punica_support(): - if ops.is_custom_op_supported("_punica_C::dispatch_bgmv"): - return - - if torch.cuda.get_device_capability() < (8, 0): - raise ImportError( - "punica LoRA kernels require compute capability >= 8.0") - else: - raise ImportError( - "punica LoRA kernels could not be imported. 
If you built vLLM " - "from source, make sure VLLM_INSTALL_PUNICA_KERNELS=1 env var " - "was set.") ->>>>>>> main def _get_prefilling_params(token_lora_tensor: torch.Tensor, cache_clear: bool = False): @@ -86,7 +68,6 @@ def add_shrink( y=x@w_t_all When `is_prefilling` is True, will launch `sgmv_shrink` """ -<<<<<<< HEAD if is_prefilling: ( b_seq_start_tensor, @@ -108,11 +89,6 @@ def add_shrink( ) else: bgmv_shrink(x, w_t_all, y, lora_indices_tensor, scale) -======= - _check_punica_support() - - ops.dispatch_bgmv(y, x, w_t_all, indicies, layer_idx, scale) ->>>>>>> main def add_expand( @@ -129,7 +105,6 @@ def add_expand( y+=x@w_t_all When `is_prefilling` is True, will launch `sgmv_expand`, """ -<<<<<<< HEAD if is_prefilling: ( b_seq_start_tensor, @@ -151,21 +126,6 @@ def add_expand( ) else: bgmv_expand(x, w_t_all, y, lora_indices_tensor, add_inputs=add_input) -======= - _check_punica_support() - - ops.dispatch_bgmv_low_level( - y, - x, - w_t_all, - indicies, - layer_idx, - scale, - x.size(1), - y_slice_size, - y_offset, - ) ->>>>>>> main def add_expand_slice( @@ -183,7 +143,6 @@ def add_expand_slice( """ y+=x@w_t_all """ -<<<<<<< HEAD if is_prefilling: ( b_seq_start_tensor, @@ -216,36 +175,6 @@ def add_expand_slice( add_inputs=add_input, ) -======= - _check_punica_support() - - r = wb_t_all.size(-1) - if buffer is None: - # We set the buffer to be float32 by default to avoid - # numerical inaccuracies that would otherwise happen - # due to downcasting. - buffer = torch.zeros((x.size(0), r), - dtype=torch.float32, - device=x.device) - ops.dispatch_bgmv(buffer, x, wa_t_all, indicies, layer_idx, 1.0) - ops.dispatch_bgmv(y, buffer, wb_t_all, indicies, layer_idx, scale) - - -def add_lora_slice(y: torch.Tensor, - x: torch.Tensor, - wa_t_all: torch.Tensor, - wb_t_all: torch.Tensor, - indicies: torch.LongTensor, - layer_idx: int, - scale: float, - y_offset: int, - y_slice_size: int, - *, - buffer: Optional[torch.Tensor] = None): - """ - Same as `add_lora` but you can operate on slices of y. - Pass whole y, define y_offset and y_slice_size. ->>>>>>> main def add_lora( y: torch.Tensor, @@ -285,10 +214,6 @@ def add_lora( buffer (Optional[torch.Tensor], optional): Defaults to None. cache_clear (bool, optional): Defaults to False. 
""" -<<<<<<< HEAD -======= - _check_punica_support() ->>>>>>> main r = wb_t_all.size(-1) if buffer is None: @@ -297,27 +222,13 @@ def add_lora( buffer = torch.zeros((x.size(0), r), dtype=torch.float32, device=x.device) -<<<<<<< HEAD add_shrink( -======= - ops.dispatch_bgmv_low_level( ->>>>>>> main buffer, x, wa_t_all, lora_indices_tensor, 0, -<<<<<<< HEAD -======= - ) - ops.dispatch_bgmv_low_level( - y, - buffer, - wb_t_all, - indicies, - layer_idx, ->>>>>>> main scale, is_prefilling, cache_clear=cache_clear, From 0f71cc4cdb24f6d7f54a62319b06323c3e46aca4 Mon Sep 17 00:00:00 2001 From: jeejeeli Date: Fri, 21 Jun 2024 21:48:30 +0800 Subject: [PATCH 31/71] delete punica test --- tests/lora/test_punica.py | 257 -------------------------------------- 1 file changed, 257 deletions(-) delete mode 100644 tests/lora/test_punica.py diff --git a/tests/lora/test_punica.py b/tests/lora/test_punica.py deleted file mode 100644 index 110c9b243507..000000000000 --- a/tests/lora/test_punica.py +++ /dev/null @@ -1,257 +0,0 @@ -# Based on code from https://github.com/punica-ai/punica - -import pytest -import torch - -import vllm.lora.punica as punica - - -def assert_close(a, b): - rtol, atol = { - torch.float16: (5e-3, 5e-3), - torch.bfloat16: (3e-2, 2e-2), - torch.float32: (None, None), - }[a.dtype] - torch.testing.assert_close(a, b, rtol=rtol, atol=atol) - - -def _lora_ref_impl( - y_final: torch.Tensor, - x: torch.Tensor, - wa_T_all: torch.Tensor, - wb_T_all: torch.Tensor, - indicies: torch.LongTensor, - layer_idx: int, - scale: float, -): - y_stage_1 = torch.empty( - (x.size(0), wa_T_all.size(-2)), - dtype=torch.float32, - device=x.device, - ) - bs = x.shape[0] - s = torch.tensor(scale, dtype=torch.float32, device=x.device) - for i, lora_idx in zip(range(bs), indicies.cpu().tolist()): - xi = x[i].unsqueeze(0).to(torch.float32) - wa = wa_T_all[lora_idx, layer_idx].transpose(-1, -2).to(torch.float32) - if wb_T_all is not None: - wb = wb_T_all[lora_idx, layer_idx].transpose(-1, - -2).to(torch.float32) - - tmp = xi @ wa - y_stage_1[i] = tmp.squeeze(0) - y_final[i] += ((tmp @ wb).squeeze(0) * - s if wb_T_all is not None else y_stage_1[i]) - return y_final, y_stage_1 - - -H1 = H2 = [ - 128, - 256, - 512, - 896, - 1024, - 1152, - 1216, - 1280, - 1536, - 1664, - 2048, - 2240, - 2304, - 2368, - 2432, - 2560, - 2752, - 3072, - 3328, - 3456, - 3584, - 3712, - 4096, - 4480, - 4608, - 4736, - 4864, - 5120, - 5504, - 5632, - 5888, - 6144, - 6400, - 6848, - 6912, - 7168, - 7424, - 8192, - 8960, - 9216, - 9472, - 10240, - 11008, - 11264, - 13824, - 14336, - 14784, - 14848, - 15360, - 18944, - 22016, - 22528, - 24576, - 27392, - 27648, - 29568, - 29696, - 32000, - 32256, - 32512, - 32768, - 33024, - 36864, - 43264, - 49152, - 60544, - 60672, - 64000, - 64256, - 102400, - 102656, - 128000, - 128256, -] -H2 = [64] + H2 -R = [1, 2, 4] -SEED = [0xabcdabcd987] -CUDA_DEVICES = [ - f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) -] - - -@pytest.mark.parametrize("dtype_str", ["float16", "bfloat16"]) -@pytest.mark.parametrize("h1", H1) -@pytest.mark.parametrize("r", R) -@pytest.mark.parametrize("seed", SEED) -@torch.inference_mode() -def test_lora_a_extra_shapes(dtype_str, h1, r, seed): - torch.manual_seed(seed) - num_loras = 4 - num_layers = 1 - bs = 32 - dtype = getattr(torch, dtype_str) - device = torch.device("cuda") - - wa_T_all = torch.randn(num_loras, - num_layers, - r, - h1, - dtype=dtype, - device=device) - indices = torch.randint(num_loras, (bs, ), dtype=torch.long, device=device) - - for 
layer_idx in range(num_layers): - x = torch.randn(bs, h1, dtype=dtype, device=device) - y = torch.randn(bs, r, dtype=dtype, device=device) - - y_ref = y.clone() - _lora_ref_impl( - y_ref, - x, - wa_T_all, - None, - indices, - layer_idx, - 1.0, - ) - - y_our = y.clone() - punica.bgmv(y_our, x, wa_T_all, indices, layer_idx, 1.0) - - assert_close(y_ref, y_our) - - -@pytest.mark.parametrize("dtype_str", ["float16", "bfloat16"]) -@pytest.mark.parametrize("h1", H1) -@pytest.mark.parametrize("h2", H2) -@pytest.mark.parametrize("seed", SEED) -@pytest.mark.parametrize("device", CUDA_DEVICES) -@torch.inference_mode() -def test_lora_correctness(dtype_str, h1, h2, seed, device): - torch.manual_seed(seed) - num_loras = 4 - num_layers = 1 - r = 8 - bs = 32 - scale = 0.123 - dtype = getattr(torch, dtype_str) - torch.set_default_device(device) - - wa_T_all = torch.randn(num_loras, num_layers, r, h1, dtype=dtype) - wb_T_all = torch.randn(num_loras, num_layers, h2, r, dtype=dtype) - indices = torch.randint(num_loras, (bs, ), dtype=torch.long) - - for layer_idx in range(num_layers): - x = torch.randn(bs, h1, dtype=dtype) - y = torch.randn(bs, h2, dtype=dtype) - - y_ref = y.clone() - _lora_ref_impl(y_ref, x, wa_T_all, wb_T_all, indices, layer_idx, scale) - - y_our = y.clone() - punica.add_lora(y_our, x, wa_T_all, wb_T_all, indices, layer_idx, - scale) - - assert_close(y_ref, y_our) - - -@pytest.mark.parametrize("dtype_str", ["float16", "bfloat16"]) -@pytest.mark.parametrize("h1", H1) -@pytest.mark.parametrize("h2", H2) -@pytest.mark.parametrize("seed", SEED) -@pytest.mark.parametrize("device", CUDA_DEVICES) -@torch.inference_mode() -def test_lora_correctness_slice(dtype_str, h1, h2, seed, device): - if h2 % 3 != 0 or h2 // 3 not in H1: - pytest.skip("h2 must be divisible by 3 and in supported shapes") - torch.manual_seed(seed) - num_loras = 4 - num_layers = 1 - r = 8 - bs = 32 - scale = 0.123 - dtype = getattr(torch, dtype_str) - torch.set_default_device(device) - - wa_T_all_0 = torch.randn(num_loras, num_layers, r, h1, dtype=dtype) - wa_T_all_1 = torch.randn(num_loras, num_layers, r, h1, dtype=dtype) - wa_T_all_2 = torch.randn(num_loras, num_layers, r, h1, dtype=dtype) - wb_T_all_0 = torch.randn(num_loras, num_layers, h2 // 3, r, dtype=dtype) - wb_T_all_1 = torch.randn(num_loras, num_layers, h2 // 3, r, dtype=dtype) - wb_T_all_2 = torch.randn(num_loras, num_layers, h2 // 3, r, dtype=dtype) - - indices = torch.randint(num_loras, (bs, ), dtype=torch.long) - - for layer_idx in range(num_layers): - x = torch.randn(bs, h1, dtype=dtype) - y = torch.randn(bs, h2, dtype=dtype) - s = h2 // 3 - - y_ref = y.clone() - _lora_ref_impl(y_ref[:, :s], x, wa_T_all_0, wb_T_all_0, indices, - layer_idx, scale) - _lora_ref_impl(y_ref[:, s:s * 2], x, wa_T_all_1, wb_T_all_1, indices, - layer_idx, scale) - _lora_ref_impl(y_ref[:, s * 2:], x, wa_T_all_2, wb_T_all_2, indices, - layer_idx, scale) - - y_our = y.clone() - punica.add_lora_slice(y_our, x, wa_T_all_0, wb_T_all_0, indices, - layer_idx, scale, 0, s) - punica.add_lora_slice(y_our, x, wa_T_all_1, wb_T_all_1, indices, - layer_idx, scale, s, s) - punica.add_lora_slice(y_our, x, wa_T_all_2, wb_T_all_2, indices, - layer_idx, scale, s * 2, s) - - assert_close(y_ref[:, :s], y_our[:, :s]) - assert_close(y_ref[:, s:s * 2], y_our[:, s:s * 2]) - assert_close(y_ref[:, s * 2:], y_our[:, s * 2:]) From b36a92e00ba332eb9014c1e396eacc0b0ae418c4 Mon Sep 17 00:00:00 2001 From: jeejeeli Date: Tue, 25 Jun 2024 15:00:11 +0800 Subject: [PATCH 32/71] fix bug --- vllm/worker/model_runner.py | 2 +- 1 
file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 69ab24a872e4..c88221de127b 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -635,7 +635,7 @@ def _prepare_model_input( if self.lora_config: lora_mapping = LoRAMapping(lora_index_mapping, lora_prompt_mapping, - bool(attn_metadata.prefill_metadata)) + is_prompt) else: lora_mapping = None From 6f06eb8455c46edd48e12bf3da8c9f2308aa6da5 Mon Sep 17 00:00:00 2001 From: jeejeeli Date: Wed, 26 Jun 2024 16:43:38 +0800 Subject: [PATCH 33/71] optimize unit test --- tests/lora/test_triton_punica.py | 477 ++++++++++------------------- vllm/lora/ops/bgmv_expand.py | 5 - vllm/lora/ops/bgmv_expand_slice.py | 3 +- vllm/lora/ops/bgmv_shrink.py | 5 +- 4 files changed, 169 insertions(+), 321 deletions(-) diff --git a/tests/lora/test_triton_punica.py b/tests/lora/test_triton_punica.py index e55f1373aa2a..56df321714a4 100644 --- a/tests/lora/test_triton_punica.py +++ b/tests/lora/test_triton_punica.py @@ -1,7 +1,8 @@ +import random + import pytest import torch -from vllm._custom_ops import dispatch_bgmv, dispatch_bgmv_low_level from vllm.lora.ops.bgmv_expand import bgmv_expand from vllm.lora.ops.bgmv_expand_slice import bgmv_expand_slice from vllm.lora.ops.bgmv_shrink import bgmv_shrink @@ -9,7 +10,6 @@ from vllm.lora.ops.sgmv_expand_slice import sgmv_expand_slice from vllm.lora.ops.sgmv_shrink import sgmv_shrink -# The current punica kernel supports dimension and adds a dimension of 3424. HIDDEN_SIZES = [ 128, 256, @@ -62,19 +62,14 @@ 128256, ] -_BATCH_SIZE_ALIGNMENT = 8 - -# vllm support batch size -BATCHS = [1, 2, 4] + [_BATCH_SIZE_ALIGNMENT * i for i in range(1, 8)] +BATCHS = [1, 2, 4] + [8 * i for i in range(1, 4)] -NUM_LORA = [1, 4, 8, 16, 32, 64, 128, 256] +NUM_LORA = [1, 4, 8, 16, 32, 64, 128] DTYPES = [torch.float16, torch.bfloat16] -MAX_RANKS = [1, 4, 8, 16, 32, 64, 128] +MAX_RANKS = [8, 16, 32, 64] SCALES = [0.5] -OP_TYPES = ["shrink", "expand"] SEED = [0] CUDA_DEVICES = [f"cuda:{0}"] -NSLICES = [2, 3] def assert_close(a, b): @@ -86,14 +81,6 @@ def assert_close(a, b): torch.testing.assert_close(a, b, rtol=rtol, atol=atol) -@torch.inference_mode() -def _punica_bgmv(out_tensor, inputs, lora_weights, indices, scaling): - layer_idx = 0 - dispatch_bgmv(out_tensor, inputs, lora_weights, indices, layer_idx, - scaling) - return - - def _torch_groupgemm( out_tensor, inputs, @@ -121,11 +108,10 @@ def _torch_groupgemm( return -def _generate_data(batchs, hidden_size, lora_nums, max_rank, max_length, dtype, +def _generate_data(batchs, hidden_size, lora_nums, max_rank, seq_length, dtype, op_type, device): - if max_length == 1: - max_length += 1 - seq_len_tensor = torch.randint(1, max_length, (batchs, )).to(device) + seq_len_tensor = torch.randint(seq_length, seq_length + 1, + (batchs, )).to(device) b_seq_start_loc = torch.cumsum( torch.tensor([0] + seq_len_tensor[:-1].tolist(), dtype=torch.long), dim=0, @@ -143,11 +129,8 @@ def _generate_data(batchs, hidden_size, lora_nums, max_rank, max_length, dtype, dtype=dtype, device=inputs_tensor.device) # NOTE shrink kernel using torch.float32 as output type - our_out_tensor = torch.zeros( - (total_tokens, max_rank), - dtype=torch.float32, - device=inputs_tensor.device, - ) + our_out_tensor = torch.zeros((total_tokens, max_rank), + dtype=torch.float32).to(device) else: inputs_tensor = torch.rand( (total_tokens, max_rank), @@ -162,11 +145,9 @@ def _generate_data(batchs, hidden_size, lora_nums, max_rank, max_length, dtype, 
ref_out_tensor = torch.rand( (total_tokens, hidden_size), dtype=dtype, - device=inputs_tensor.device, - ) + ).to(device) # Ensure the same input. our_out_tensor = ref_out_tensor.clone() - lora_indices_tensor = torch.randint(0, lora_nums - 1 if lora_nums > 1 else 1, (batchs, )).to(device) @@ -175,7 +156,7 @@ def _generate_data(batchs, hidden_size, lora_nums, max_rank, max_length, dtype, for b_id in range(batchs): lora_index = lora_indices_tensor[b_id] indices[current_offset:current_offset + - seq_len_tensor[b_id]] = lora_index.item() + seq_len_tensor[b_id]].copy_(lora_index) current_offset += seq_len_tensor[b_id].item() return ( inputs_tensor, @@ -190,164 +171,86 @@ def _generate_data(batchs, hidden_size, lora_nums, max_rank, max_length, dtype, def _generate_data_expand_nslices(batchs, hidden_size, lora_nums, max_rank, - max_length, dtype, nslices, device): - if max_length == 1: - max_length += 1 - seq_len_tensor = torch.randint(1, max_length, (batchs, )).to(device) - b_seq_start_loc = torch.cumsum( - torch.tensor([0] + seq_len_tensor[:-1].tolist(), dtype=torch.long), - dim=0, - ).to(device) - total_tokens = seq_len_tensor.sum() - - inputs_tensor = torch.rand( - (total_tokens, max_rank), - dtype=dtype, - ).to(device) - lora_weights_lst = [] - for _ in range(nslices): - lora_weights_lst.append( - torch.rand( - (lora_nums, hidden_size, max_rank), # col-major - dtype=dtype, - ).to(device)) - # expand op needs to complete y+=a@lora_b, so output is - # initinized randomly - ref_out_tensor = torch.rand( - (total_tokens, hidden_size * nslices), - dtype=dtype, - device=inputs_tensor.device, - ) - # Ensure the same input. - our_out_tensor = ref_out_tensor.clone() - - lora_indices_tensor = torch.randint(0, - lora_nums - 1 if lora_nums > 1 else 1, - (batchs, )).to(device) - indices = torch.zeros((total_tokens), dtype=torch.long).to(device) - current_offset = 0 - for b_id in range(batchs): - lora_index = lora_indices_tensor[b_id] - indices[current_offset:current_offset + - seq_len_tensor[b_id]] = lora_index.item() - current_offset += seq_len_tensor[b_id].item() - return ( - inputs_tensor, - lora_weights_lst, - our_out_tensor, - ref_out_tensor, - b_seq_start_loc, - lora_indices_tensor, - seq_len_tensor, - indices, - ) - - -# @pytest.mark.parametrize("batchs", BATCHS) -# @pytest.mark.parametrize("num_loras", NUM_LORA) -# @pytest.mark.parametrize("rank", MAX_RANKS) -# @pytest.mark.parametrize("scaling", SCALES) -# @pytest.mark.parametrize("dtype", DTYPES) -# @pytest.mark.parametrize("op_type", OP_TYPES) -# @pytest.mark.parametrize("seed", SEED) -# @pytest.mark.parametrize("device", CUDA_DEVICES) -# def test_sgmv_torch( -# batchs: int, -# num_loras: int, -# rank: int, -# scaling: float, -# dtype: torch.dtype, -# op_type: str, -# seed: int, -# device: str, -# ): -# torch.manual_seed(seed) -# torch.set_default_device(device) -# if batchs == 0: -# batchs += 1 -# hidden_size_index = random.randint(0, len(HIDDEN_SIZES) - 1) -# hidden_size = HIDDEN_SIZES[hidden_size_index] -# if hidden_size > 100000: -# hidden_size = hidden_size // 4 # avoid OOM -# ( -# inputs_tensor, -# lora_weights, -# our_out_tensor, -# ref_out_tensor, -# b_seq_start_loc, -# lora_indices_tensor, -# seq_len_tensor, -# indices, -# ) = _generate_data( -# batchs, hidden_size, num_loras, rank, 1024, dtype, op_type, device -# ) # The sequence length is restricted to the range [1, 1024]. 
-# max_seq_length = seq_len_tensor.max() -# if isinstance(max_seq_length, tuple): -# max_seq_length = max_seq_length[0].item() -# else: -# max_seq_length = max_seq_length.item() -# if op_type == "shrink": -# sgmv_shrink( -# inputs_tensor, -# lora_weights, -# our_out_tensor, -# b_seq_start_loc, -# seq_len_tensor, -# lora_indices_tensor, -# batchs, -# max_seq_length, -# scaling, -# ) -# else: -# sgmv_expand( -# inputs_tensor, -# lora_weights, -# our_out_tensor, -# b_seq_start_loc, -# seq_len_tensor, -# lora_indices_tensor, -# batchs, -# max_seq_length, -# add_inputs=True, -# ) -# _torch_groupgemm( -# ref_out_tensor, -# inputs_tensor, -# lora_weights, -# lora_indices_tensor, -# seq_len_tensor, -# batchs, -# scaling if op_type == "shrink" else 1.0, -# op_type, -# ) -# if op_type == "shrink": -# ref_out_tensor = ref_out_tensor.to(torch.float32) -# assert_close(our_out_tensor, ref_out_tensor) + seq_length, dtype, nslices, device): + try: + seq_len_tensor = torch.randint(seq_length, seq_length + 1, + (batchs, )).to(device) + b_seq_start_loc = torch.cumsum( + torch.tensor([0] + seq_len_tensor[:-1].tolist(), dtype=torch.long), + dim=0, + ).to(device) + total_tokens = seq_len_tensor.sum() + inputs_tensor = torch.rand( + (total_tokens, max_rank), + dtype=dtype, + ).to(device) + lora_weights_lst = [] + for _ in range(nslices): + lora_weights_lst.append( + torch.rand( + (lora_nums, hidden_size, max_rank), # col-major + dtype=dtype, + ).to(device)) + # expand op needs to complete y+=a@lora_b, so output is + # initinized randomly + ref_out_tensor = torch.rand((total_tokens, hidden_size * nslices), + dtype=dtype).to(device) + # Ensure the same input. + our_out_tensor = ref_out_tensor.clone() + lora_indices_tensor = torch.randint( + 0, lora_nums - 1 if lora_nums > 1 else 1, (batchs, )) + indices = torch.zeros((total_tokens), dtype=torch.long).to(device) + current_offset = 0 + for b_id in range(batchs): + lora_index = lora_indices_tensor[b_id] + indices[current_offset:current_offset + + seq_len_tensor[b_id]] = lora_index.item() + current_offset += seq_len_tensor[b_id].item() + + lora_indices_tensor = lora_indices_tensor.to(device) + return ( + inputs_tensor, + lora_weights_lst, + our_out_tensor, + ref_out_tensor, + b_seq_start_loc, + lora_indices_tensor, + seq_len_tensor, + indices, + ) + except Exception as error: + raise error -@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES) +@pytest.mark.parametrize("batchs", BATCHS) +@pytest.mark.parametrize("num_loras", NUM_LORA) +@pytest.mark.parametrize("rank", MAX_RANKS) @pytest.mark.parametrize("scaling", SCALES) @pytest.mark.parametrize("dtype", DTYPES) -@pytest.mark.parametrize("op_type", OP_TYPES) +@pytest.mark.parametrize("op_type", ["shrink", "expand"]) @pytest.mark.parametrize("seed", SEED) @pytest.mark.parametrize("device", CUDA_DEVICES) -def test_triton_sgmv_punica_bgmv( - hidden_size, +def test_punica_sgmv( + batchs: int, + num_loras: int, + rank: int, scaling: float, dtype: torch.dtype, op_type: str, seed: int, device: str, ): - # avoid `No suitable kernel. 
h_in=xx h_out=xxxx ` error - if dtype == torch.float32 or hidden_size == 3424: - return - torch.manual_seed(seed) + random.seed(seed) torch.set_default_device(device) - batchs = 4 # Arbitrary values for testing - rank = 16 # Arbitrary values for testing - seq_len = 128 # Arbitrary values for testing - num_loras = 8 # Arbitrary values for testing + torch.random.manual_seed(seed) + if torch.cuda.is_available(): + torch.cuda.manual_seed(seed) + + hidden_size_index = random.randint(0, len(HIDDEN_SIZES) - 1) + hidden_size = HIDDEN_SIZES[hidden_size_index] + if hidden_size > 100000: + hidden_size = hidden_size // 4 # avoid OOM + seq_length = 128 ( inputs_tensor, lora_weights, @@ -357,9 +260,8 @@ def test_triton_sgmv_punica_bgmv( lora_indices_tensor, seq_len_tensor, indices, - ) = _generate_data(batchs, hidden_size, num_loras, rank, seq_len, dtype, + ) = _generate_data(batchs, hidden_size, num_loras, rank, seq_length, dtype, op_type, device) - max_seq_length = seq_len_tensor.max() if isinstance(max_seq_length, tuple): max_seq_length = max_seq_length[0].item() @@ -389,13 +291,15 @@ def test_triton_sgmv_punica_bgmv( max_seq_length, add_inputs=True, ) - lora_weights_4d = lora_weights.unsqueeze(dim=1) - _punica_bgmv( + _torch_groupgemm( ref_out_tensor, inputs_tensor, - lora_weights_4d, - indices, + lora_weights, + lora_indices_tensor, + seq_len_tensor, + batchs, scaling if op_type == "shrink" else 1.0, + op_type, ) if op_type == "shrink": ref_out_tensor = ref_out_tensor.to(torch.float32) @@ -403,31 +307,34 @@ def test_triton_sgmv_punica_bgmv( @pytest.mark.parametrize("batchs", BATCHS) -@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES) +@pytest.mark.parametrize("num_loras", NUM_LORA) +@pytest.mark.parametrize("rank", MAX_RANKS) @pytest.mark.parametrize("scaling", SCALES) @pytest.mark.parametrize("dtype", DTYPES) -@pytest.mark.parametrize("op_type", OP_TYPES) +@pytest.mark.parametrize("op_type", ["shrink", "expand"]) @pytest.mark.parametrize("seed", SEED) @pytest.mark.parametrize("device", CUDA_DEVICES) -def test_triton_bgmv_punica_bgmv( +def test_punica_bgmv( batchs: int, - hidden_size: int, + num_loras: int, + rank: int, scaling: float, dtype: torch.dtype, op_type: str, seed: int, device: str, ): - # avoid `No suitable kernel. 
h_in=xx h_out=xxxx ` error - if dtype == torch.float32 or hidden_size == 3424: - return - torch.manual_seed(seed) + random.seed(seed) torch.set_default_device(device) - if batchs == 0: - batchs += 1 - rank = 16 - seq_len = 1 # - num_loras = 8 # Arbitrary values for testing + torch.random.manual_seed(seed) + if torch.cuda.is_available(): + torch.cuda.manual_seed(seed) + + hidden_size_index = random.randint(0, len(HIDDEN_SIZES) - 1) + hidden_size = HIDDEN_SIZES[hidden_size_index] + if hidden_size > 100000: + hidden_size = hidden_size // 4 # avoid OOM + seq_length = 1 ( inputs_tensor, lora_weights, @@ -437,15 +344,14 @@ def test_triton_bgmv_punica_bgmv( lora_indices_tensor, seq_len_tensor, indices, - ) = _generate_data(batchs, hidden_size, num_loras, rank, seq_len, dtype, + ) = _generate_data(batchs, hidden_size, num_loras, rank, seq_length, dtype, op_type, device) - if op_type == "shrink": bgmv_shrink( inputs_tensor, lora_weights, our_out_tensor, - lora_indices_tensor, + indices, scaling, ) else: @@ -453,16 +359,18 @@ def test_triton_bgmv_punica_bgmv( inputs_tensor, lora_weights, our_out_tensor, - lora_indices_tensor, + indices, add_inputs=True, ) - lora_weights_4d = lora_weights.unsqueeze(dim=1) - _punica_bgmv( + _torch_groupgemm( ref_out_tensor, inputs_tensor, - lora_weights_4d, - indices, + lora_weights, + lora_indices_tensor, + seq_len_tensor, + batchs, scaling if op_type == "shrink" else 1.0, + op_type, ) if op_type == "shrink": ref_out_tensor = ref_out_tensor.to(torch.float32) @@ -470,27 +378,33 @@ def test_triton_bgmv_punica_bgmv( @pytest.mark.parametrize("batchs", BATCHS) -@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES) -@pytest.mark.parametrize("nslices", NSLICES) +@pytest.mark.parametrize("num_loras", NUM_LORA) +@pytest.mark.parametrize("rank", MAX_RANKS) +@pytest.mark.parametrize("nslices", [2, 3]) @pytest.mark.parametrize("dtype", DTYPES) +@pytest.mark.parametrize("op_type", ["sgmv", "bgmv"]) @pytest.mark.parametrize("seed", SEED) @pytest.mark.parametrize("device", CUDA_DEVICES) -def test_sgmv_expand_slice( +def test_punica_expand_nslices( batchs: int, - hidden_size: int, + num_loras: int, + rank: int, nslices: int, - dtype: str, + dtype: torch.dtype, + op_type: str, seed: int, device: str, ): - # avoid `No suitable kernel. 
h_in=xx h_out=xxxx ` error - if dtype == torch.float32 or hidden_size == 3424: - return - torch.manual_seed(seed) + random.seed(seed) torch.set_default_device(device) - max_rank = 16 - lora_nums = 4 - max_length = 128 + torch.random.manual_seed(seed) + if torch.cuda.is_available(): + torch.cuda.manual_seed(seed) + hidden_size_index = random.randint(0, len(HIDDEN_SIZES) - 1) + hidden_size = HIDDEN_SIZES[hidden_size_index] + if hidden_size > 100000: + hidden_size = hidden_size // 4 # avoid OOM + seq_length = 128 if op_type == "sgmv" else 1 ( inputs_tensor, lora_weights_lst, @@ -503,9 +417,9 @@ def test_sgmv_expand_slice( ) = _generate_data_expand_nslices( batchs, hidden_size, - lora_nums, - max_rank, - max_length, + num_loras, + rank, + seq_length, dtype, nslices, device, @@ -518,109 +432,48 @@ def test_sgmv_expand_slice( slice_offset = 0 for index in range(nslices): lora_weights = lora_weights_lst[index] - sgmv_expand_slice( + if op_type == "sgmv": + sgmv_expand_slice( + inputs_tensor, + lora_weights, + our_outputs, + b_seq_start_loc, + seq_len_tensor, + lora_indices_tensor, + batchs, + max_seq_length, + slice_offset, + hidden_size, + add_inputs=True, + ) + else: + bgmv_expand_slice( + inputs_tensor, + lora_weights, + our_outputs, + indices, + slice_offset, + slice_size=hidden_size, + add_inputs=True, + ) + _torch_groupgemm( + ref_outputs[:, slice_offset:slice_offset + hidden_size], inputs_tensor, lora_weights, - our_outputs, - b_seq_start_loc, - seq_len_tensor, lora_indices_tensor, + seq_len_tensor, batchs, - max_seq_length, - slice_offset, - hidden_size, - add_inputs=True, - ) - lora_weights_4d = lora_weights.unsqueeze(dim=1) - dispatch_bgmv_low_level( - ref_outputs, - inputs_tensor, - lora_weights_4d, - indices, - 0, 1.0, - inputs_tensor.size(1), - hidden_size, - slice_offset, + op_type="expand", ) - slice_offset += hidden_size - assert_close(our_outputs, ref_outputs) - -@pytest.mark.parametrize("batchs", BATCHS) -@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES) -@pytest.mark.parametrize("nslices", NSLICES) -@pytest.mark.parametrize("dtype", DTYPES) -@pytest.mark.parametrize("seed", SEED) -@pytest.mark.parametrize("device", CUDA_DEVICES) -def test_bgmv_expand_slice( - batchs: int, - hidden_size: int, - nslices: int, - dtype: str, - seed: int, - device: str, -): - # avoid `No suitable kernel. 
h_in=xx h_out=xxxx ` error - if dtype == torch.float32 or hidden_size == 3424: - return - torch.manual_seed(seed) - torch.set_default_device(device) - max_rank = 64 - lora_nums = 8 - ( - inputs_tensor, - lora_weights_lst, - our_outputs, - ref_outputs, - b_seq_start_loc, - lora_indices_tensor, - seq_len_tensor, - indices, - ) = _generate_data_expand_nslices( - batchs, - hidden_size, - lora_nums, - max_rank, - 1, - dtype, - nslices, - device, - ) - slice_offset = 0 - for index in range(nslices): - lora_weights = lora_weights_lst[index] - bgmv_expand_slice( - inputs_tensor, - lora_weights, - our_outputs, - lora_indices_tensor, - slice_offset, - slice_size=hidden_size, - add_inputs=True, - ) - lora_weights_4d = lora_weights.unsqueeze(dim=1) - dispatch_bgmv_low_level( - ref_outputs, - inputs_tensor, - lora_weights_4d, - lora_indices_tensor, - 0, - 1.0, - inputs_tensor.size(1), - hidden_size, - slice_offset, - ) slice_offset += hidden_size assert_close(our_outputs, ref_outputs) if __name__ == "__main__": - test_bgmv_expand_slice( - batchs=32, - hidden_size=128, - nslices=2, - dtype=torch.bfloat16, - seed=0, - device="cuda:0", - ) + # cuda:0-0-bgmv-dtype1-3-32-16-24 + for _ in range(1000): + test_punica_expand_nslices(24, 16, 32, 3, torch.bfloat16, "bgmv", 0, + "cuda:0") + print("ssss") diff --git a/vllm/lora/ops/bgmv_expand.py b/vllm/lora/ops/bgmv_expand.py index ec68c6d20f98..998095c412e6 100644 --- a/vllm/lora/ops/bgmv_expand.py +++ b/vllm/lora/ops/bgmv_expand.py @@ -124,15 +124,11 @@ def bgmv_expand( lora_b_weights = lora_b_weights.squeeze(dim=1) else: assert lora_b_weights.ndim == 3 # shape:(lora_num,size,rank) - assert lora_b_weights.is_contiguous() # TODO tuning this config - N, K = lora_b_weights.shape[-2:] # K= rank,N=hidden_size - # BLOCK_N =64 BLOCK_K = triton.next_power_of_2(K) - # SPLIT_N = 8 EVEN_K = K % BLOCK_K == 0 ADD_INPUTS = add_inputs CAST_TYPE = False @@ -142,7 +138,6 @@ def bgmv_expand( ]: CAST_TYPE = True batchs = lora_indices_tensor.size(0) - if override_config: config = override_config else: diff --git a/vllm/lora/ops/bgmv_expand_slice.py b/vllm/lora/ops/bgmv_expand_slice.py index af343d6eae1c..071dbe40f216 100644 --- a/vllm/lora/ops/bgmv_expand_slice.py +++ b/vllm/lora/ops/bgmv_expand_slice.py @@ -110,7 +110,7 @@ def bgmv_expand_slice( slice_size (int): current output_tensor's size batchs (int): batch size add_inputs (bool, optional): Defaults to False. - override_config (Optional[Dict[str, int]], optional): Defaults to None. + override_config (Optional[Dict[str, int]], optional): Defaults to None. 
Triton grid config """ @@ -138,6 +138,7 @@ def bgmv_expand_slice( N, K = lora_b_weights.shape[-2:] # K= rank,N=hidden_size # BLOCK_N = 256 BLOCK_K = triton.next_power_of_2(K) + # SPLIT_N = 64 EVEN_K = K % BLOCK_K == 0 ADD_INPUTS = add_inputs diff --git a/vllm/lora/ops/bgmv_shrink.py b/vllm/lora/ops/bgmv_shrink.py index 6b92ed72c4c2..3258a60d2455 100644 --- a/vllm/lora/ops/bgmv_shrink.py +++ b/vllm/lora/ops/bgmv_shrink.py @@ -48,7 +48,6 @@ def _bgmv_shrink_kernel( offset_k = tl.arange(0, BLOCK_K) + pid_sk * BLOCK_K a_ptr = input_ptr + cur_batch * xm_stride b_ptr = lora_ptr + l0_stride * lora_index - rank_mask = offset_n[:, None] < N accumulator = tl.zeros((BLOCK_N, ), dtype=tl.float32) for k in range(0, K, BLOCK_K * SPLIT_K): current_k = k + offset_k @@ -59,7 +58,7 @@ def _bgmv_shrink_kernel( mask=current_k < K, other=0.0, ) # [BLOCK_K] - b_ptr_mask = (rank_mask < N) & (current_k[None, :] < K) + b_ptr_mask = (offset_n[:, None] < N) & (current_k[None, :] < K) tiled_b = tl.load( b_ptr + offset_n[:, None] * lora_k_stride + @@ -119,7 +118,7 @@ def bgmv_shrink( # TODO tuning this config batchs = lora_indices_tensor.size(0) N, K = lora_a_weights.shape[-2:] # K=hidden_size,N=rank - BLOCK_N = triton.next_power_of_2(output_tensor.size(1)) + BLOCK_N = triton.next_power_of_2(N) if override_config: config = override_config else: From 0e7dde342be124a8367377b1a3007f5cf0a35480 Mon Sep 17 00:00:00 2001 From: jeejeeli Date: Wed, 26 Jun 2024 23:44:03 +0800 Subject: [PATCH 34/71] verify mem --- tests/lora/test_triton_punica.py | 8 -------- vllm/lora/models.py | 1 + vllm/lora/punica.py | 1 + 3 files changed, 2 insertions(+), 8 deletions(-) diff --git a/tests/lora/test_triton_punica.py b/tests/lora/test_triton_punica.py index 56df321714a4..a5ccf847afe7 100644 --- a/tests/lora/test_triton_punica.py +++ b/tests/lora/test_triton_punica.py @@ -469,11 +469,3 @@ def test_punica_expand_nslices( slice_offset += hidden_size assert_close(our_outputs, ref_outputs) - - -if __name__ == "__main__": - # cuda:0-0-bgmv-dtype1-3-32-16-24 - for _ in range(1000): - test_punica_expand_nslices(24, 16, 32, 3, torch.bfloat16, "bgmv", 0, - "cuda:0") - print("ssss") diff --git a/vllm/lora/models.py b/vllm/lora/models.py index 5010ac94d643..10ea1e69ce8a 100644 --- a/vllm/lora/models.py +++ b/vllm/lora/models.py @@ -615,6 +615,7 @@ def remove_all_loras(self): self._registered_loras.clear() self.lora_index_to_id = [None] * self.lora_slots self._active_loras.clear() + punica.reset_params_cache() def _create_lora_modules(self): for module_name, module in self.model.named_modules( diff --git a/vllm/lora/punica.py b/vllm/lora/punica.py index 321fccc9df93..e0b441e1dd08 100644 --- a/vllm/lora/punica.py +++ b/vllm/lora/punica.py @@ -44,6 +44,7 @@ def reset_params_cache(): cache explicitly """ #TODO release gpu memory + torch.cuda.empty_cache() _PARAMS_CACHE.clear() From 7419d19f457826ba52bb3582cb9ee18d31d2fccd Mon Sep 17 00:00:00 2001 From: jeejeeli Date: Thu, 27 Jun 2024 01:07:41 +0800 Subject: [PATCH 35/71] Trigger CI From 5fbb2a84c7beaba6a854ec2589b454b98faa64d0 Mon Sep 17 00:00:00 2001 From: jeejeeli Date: Sat, 29 Jun 2024 13:42:32 +0800 Subject: [PATCH 36/71] update --- tests/lora/test_triton_punica.py | 58 +++++++++++++++--------------- vllm/lora/layers.py | 41 ++++++--------------- vllm/lora/models.py | 7 ++-- vllm/lora/ops/bgmv_expand.py | 8 ++--- vllm/lora/ops/bgmv_expand_slice.py | 8 ++--- vllm/lora/ops/bgmv_shrink.py | 8 ++--- vllm/lora/ops/sgmv_expand.py | 10 +++--- vllm/lora/ops/sgmv_expand_slice.py | 10 +++--- 
vllm/lora/ops/sgmv_shrink.py | 10 +++--- vllm/lora/ops/utils.py | 6 ++-- vllm/lora/punica.py | 2 +- 11 files changed, 73 insertions(+), 95 deletions(-) diff --git a/tests/lora/test_triton_punica.py b/tests/lora/test_triton_punica.py index a5ccf847afe7..eea190b153a1 100644 --- a/tests/lora/test_triton_punica.py +++ b/tests/lora/test_triton_punica.py @@ -62,7 +62,7 @@ 128256, ] -BATCHS = [1, 2, 4] + [8 * i for i in range(1, 4)] +batches = [1, 2, 4] + [8 * i for i in range(1, 4)] NUM_LORA = [1, 4, 8, 16, 32, 64, 128] DTYPES = [torch.float16, torch.bfloat16] @@ -87,13 +87,13 @@ def _torch_groupgemm( lora_weights, lora_indices_tensor, seq_len_tensor, - batchs, + batches, scaling, op_type, ) -> torch.Tensor: out_list = [] current_offset = 0 - for lora_index, b_length in zip(range(batchs), seq_len_tensor): + for lora_index, b_length in zip(range(batches), seq_len_tensor): input_weight = inputs[current_offset:b_length + current_offset, :] current_offset += b_length lora_weight = lora_weights[lora_indices_tensor[lora_index]] @@ -108,10 +108,10 @@ def _torch_groupgemm( return -def _generate_data(batchs, hidden_size, lora_nums, max_rank, seq_length, dtype, - op_type, device): +def _generate_data(batches, hidden_size, lora_nums, max_rank, seq_length, + dtype, op_type, device): seq_len_tensor = torch.randint(seq_length, seq_length + 1, - (batchs, )).to(device) + (batches, )).to(device) b_seq_start_loc = torch.cumsum( torch.tensor([0] + seq_len_tensor[:-1].tolist(), dtype=torch.long), dim=0, @@ -150,10 +150,10 @@ def _generate_data(batchs, hidden_size, lora_nums, max_rank, seq_length, dtype, our_out_tensor = ref_out_tensor.clone() lora_indices_tensor = torch.randint(0, lora_nums - 1 if lora_nums > 1 else 1, - (batchs, )).to(device) + (batches, )).to(device) indices = torch.zeros((total_tokens), dtype=torch.long).to(device) current_offset = 0 - for b_id in range(batchs): + for b_id in range(batches): lora_index = lora_indices_tensor[b_id] indices[current_offset:current_offset + seq_len_tensor[b_id]].copy_(lora_index) @@ -170,11 +170,11 @@ def _generate_data(batchs, hidden_size, lora_nums, max_rank, seq_length, dtype, ) -def _generate_data_expand_nslices(batchs, hidden_size, lora_nums, max_rank, +def _generate_data_expand_nslices(batches, hidden_size, lora_nums, max_rank, seq_length, dtype, nslices, device): try: seq_len_tensor = torch.randint(seq_length, seq_length + 1, - (batchs, )).to(device) + (batches, )).to(device) b_seq_start_loc = torch.cumsum( torch.tensor([0] + seq_len_tensor[:-1].tolist(), dtype=torch.long), dim=0, @@ -198,10 +198,10 @@ def _generate_data_expand_nslices(batchs, hidden_size, lora_nums, max_rank, # Ensure the same input. 
our_out_tensor = ref_out_tensor.clone() lora_indices_tensor = torch.randint( - 0, lora_nums - 1 if lora_nums > 1 else 1, (batchs, )) + 0, lora_nums - 1 if lora_nums > 1 else 1, (batches, )) indices = torch.zeros((total_tokens), dtype=torch.long).to(device) current_offset = 0 - for b_id in range(batchs): + for b_id in range(batches): lora_index = lora_indices_tensor[b_id] indices[current_offset:current_offset + seq_len_tensor[b_id]] = lora_index.item() @@ -222,7 +222,7 @@ def _generate_data_expand_nslices(batchs, hidden_size, lora_nums, max_rank, raise error -@pytest.mark.parametrize("batchs", BATCHS) +@pytest.mark.parametrize("batches", batches) @pytest.mark.parametrize("num_loras", NUM_LORA) @pytest.mark.parametrize("rank", MAX_RANKS) @pytest.mark.parametrize("scaling", SCALES) @@ -231,7 +231,7 @@ def _generate_data_expand_nslices(batchs, hidden_size, lora_nums, max_rank, @pytest.mark.parametrize("seed", SEED) @pytest.mark.parametrize("device", CUDA_DEVICES) def test_punica_sgmv( - batchs: int, + batches: int, num_loras: int, rank: int, scaling: float, @@ -260,8 +260,8 @@ def test_punica_sgmv( lora_indices_tensor, seq_len_tensor, indices, - ) = _generate_data(batchs, hidden_size, num_loras, rank, seq_length, dtype, - op_type, device) + ) = _generate_data(batches, hidden_size, num_loras, rank, seq_length, + dtype, op_type, device) max_seq_length = seq_len_tensor.max() if isinstance(max_seq_length, tuple): max_seq_length = max_seq_length[0].item() @@ -275,7 +275,7 @@ def test_punica_sgmv( b_seq_start_loc, seq_len_tensor, lora_indices_tensor, - batchs, + batches, max_seq_length, scaling, ) @@ -287,7 +287,7 @@ def test_punica_sgmv( b_seq_start_loc, seq_len_tensor, lora_indices_tensor, - batchs, + batches, max_seq_length, add_inputs=True, ) @@ -297,7 +297,7 @@ def test_punica_sgmv( lora_weights, lora_indices_tensor, seq_len_tensor, - batchs, + batches, scaling if op_type == "shrink" else 1.0, op_type, ) @@ -306,7 +306,7 @@ def test_punica_sgmv( assert_close(our_out_tensor, ref_out_tensor) -@pytest.mark.parametrize("batchs", BATCHS) +@pytest.mark.parametrize("batches", batches) @pytest.mark.parametrize("num_loras", NUM_LORA) @pytest.mark.parametrize("rank", MAX_RANKS) @pytest.mark.parametrize("scaling", SCALES) @@ -315,7 +315,7 @@ def test_punica_sgmv( @pytest.mark.parametrize("seed", SEED) @pytest.mark.parametrize("device", CUDA_DEVICES) def test_punica_bgmv( - batchs: int, + batches: int, num_loras: int, rank: int, scaling: float, @@ -344,8 +344,8 @@ def test_punica_bgmv( lora_indices_tensor, seq_len_tensor, indices, - ) = _generate_data(batchs, hidden_size, num_loras, rank, seq_length, dtype, - op_type, device) + ) = _generate_data(batches, hidden_size, num_loras, rank, seq_length, + dtype, op_type, device) if op_type == "shrink": bgmv_shrink( inputs_tensor, @@ -368,7 +368,7 @@ def test_punica_bgmv( lora_weights, lora_indices_tensor, seq_len_tensor, - batchs, + batches, scaling if op_type == "shrink" else 1.0, op_type, ) @@ -377,7 +377,7 @@ def test_punica_bgmv( assert_close(our_out_tensor, ref_out_tensor) -@pytest.mark.parametrize("batchs", BATCHS) +@pytest.mark.parametrize("batches", batches) @pytest.mark.parametrize("num_loras", NUM_LORA) @pytest.mark.parametrize("rank", MAX_RANKS) @pytest.mark.parametrize("nslices", [2, 3]) @@ -386,7 +386,7 @@ def test_punica_bgmv( @pytest.mark.parametrize("seed", SEED) @pytest.mark.parametrize("device", CUDA_DEVICES) def test_punica_expand_nslices( - batchs: int, + batches: int, num_loras: int, rank: int, nslices: int, @@ -415,7 +415,7 @@ def 
test_punica_expand_nslices( seq_len_tensor, indices, ) = _generate_data_expand_nslices( - batchs, + batches, hidden_size, num_loras, rank, @@ -440,7 +440,7 @@ def test_punica_expand_nslices( b_seq_start_loc, seq_len_tensor, lora_indices_tensor, - batchs, + batches, max_seq_length, slice_offset, hidden_size, @@ -462,7 +462,7 @@ def test_punica_expand_nslices( lora_weights, lora_indices_tensor, seq_len_tensor, - batchs, + batches, 1.0, op_type="expand", ) diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py index 5a612788f4a6..0fe87d1ff2e8 100644 --- a/vllm/lora/layers.py +++ b/vllm/lora/layers.py @@ -63,31 +63,6 @@ def dec(*args, **kwargs): return dec -def _apply_expand( - x: torch.Tensor, - lora_b_stacked: torch.Tensor, - lora_index_tensor: torch.Tensor, - indices_info: List[int], - output: torch.Tensor, - add_input: bool = True, -) -> torch.Tensor: - org_output = output - x = x.view(-1, x.shape[-1]) - output = output.view(-1, output.shape[-1]) - token_num = indices_info[0] - is_prefilling = bool(indices_info[5]) - add_expand( - output, - x, - lora_b_stacked, - lora_index_tensor[:token_num], - 0, - is_prefilling, - add_input, - ) - return output.view_as(org_output) - - def _apply_lora(x: torch.Tensor, lora_a_stacked: torch.Tensor, lora_b_stacked: torch.Tensor, @@ -118,9 +93,7 @@ def _apply_lora(x: torch.Tensor, org_output = output x = x.view(-1, x.shape[-1]) output = output.view(-1, output.shape[-1]) - token_num = indices_info[0] - is_prefilling = bool(indices_info[5]) add_lora(output, x, @@ -386,12 +359,18 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: full_lora_a_embeddings.shape[1], -1, ) - _apply_expand( + full_lora_a_embeddings = full_lora_a_embeddings.view( + -1, full_lora_a_embeddings.shape[-1]) + full_output = full_output.view(-1, full_output.shape[-1]) + token_num = self.indices_len[0] + is_prefilling = bool(self.indices_len[5]) + add_expand( + full_output, full_lora_a_embeddings, self.lora_b_stacked, - self.indices, - self.indices_len, - full_output, + self.indices[:token_num], + 0, + is_prefilling, add_input=True, ) return full_output.view_as(full_output_org) diff --git a/vllm/lora/models.py b/vllm/lora/models.py index 32333f05b09d..24d95d6bb1b7 100644 --- a/vllm/lora/models.py +++ b/vllm/lora/models.py @@ -79,9 +79,9 @@ def convert_mapping( requests to RoPE offsets and rot dims for long LoRAs. None if long context lora doesn't exist. indices_len: List of lengths of the above tensors and prefilling - flag.Used to index into each tensor. It contains - (base_indices, sampler_indices, sampler_indices_padded, - embeddings_indices, long_lora_indices,prefilling flag). + flag. Used to index into each tensor. It contains + (base_indices, sampler_indices, sampler_indices_padded, + embeddings_indices, long_lora_indices, prefilling flag). """ index_mapping_indices: List[int] = list(mapping.index_mapping).copy() embedding_indices = index_mapping_indices.copy() @@ -448,7 +448,6 @@ def __init__( # Dict instead of a Set for compatibility with LRUCache. self._active_loras: Dict[int, None] = {} self._last_mapping: Optional[LoRAMapping] = None - self._convert_flag = True self._create_lora_modules() @property diff --git a/vllm/lora/ops/bgmv_expand.py b/vllm/lora/ops/bgmv_expand.py index 998095c412e6..576559beeffe 100644 --- a/vllm/lora/ops/bgmv_expand.py +++ b/vllm/lora/ops/bgmv_expand.py @@ -102,7 +102,7 @@ def bgmv_expand( output_tensor (torch.Tensor): output tensor lora_indices_tensor (torch.Tensor): (batch_size,). 
The LoRA index corresponding to each batch - batchs (int): batch size + batches (int): batch size add_inputs (bool, optional): Defaults to False. adds the final lora results to the output. override_config (Optional[Dict[str, int]], optional): Defaults to None. @@ -137,14 +137,14 @@ def bgmv_expand( torch.bfloat16, ]: CAST_TYPE = True - batchs = lora_indices_tensor.size(0) + batches = lora_indices_tensor.size(0) if override_config: config = override_config else: - config = get_lora_op_configs("expand", batchs, N) + config = get_lora_op_configs("expand", batches, N) grid = lambda META: ( META["SPLIT_N"], - batchs, + batches, ) _bgmv_expand_kernel[grid]( inputs, diff --git a/vllm/lora/ops/bgmv_expand_slice.py b/vllm/lora/ops/bgmv_expand_slice.py index 071dbe40f216..24f2b93f4bf2 100644 --- a/vllm/lora/ops/bgmv_expand_slice.py +++ b/vllm/lora/ops/bgmv_expand_slice.py @@ -108,7 +108,7 @@ def bgmv_expand_slice( corresponding to each batch slice_offst (int): output_tensor's offst slice_size (int): current output_tensor's size - batchs (int): batch size + batches (int): batch size add_inputs (bool, optional): Defaults to False. override_config (Optional[Dict[str, int]], optional): Defaults to None. Triton grid config @@ -149,16 +149,16 @@ def bgmv_expand_slice( ]: CAST_TYPE = True - batchs = lora_indices_tensor.size(0) + batches = lora_indices_tensor.size(0) if override_config: config = override_config else: - config = get_lora_op_configs("expand", batchs, N) + config = get_lora_op_configs("expand", batches, N) grid = lambda META: ( META["SPLIT_N"], - batchs, + batches, ) _bgmv_expand_slice_kernel[grid]( inputs, diff --git a/vllm/lora/ops/bgmv_shrink.py b/vllm/lora/ops/bgmv_shrink.py index 3258a60d2455..85c36fd9ce04 100644 --- a/vllm/lora/ops/bgmv_shrink.py +++ b/vllm/lora/ops/bgmv_shrink.py @@ -94,7 +94,7 @@ def bgmv_shrink( output_tensor (torch.Tensor): output tensor lora_indices_tensor (torch.Tensor): (batch_size,). The LoRA index corresponding to each batch - batchs (int): batch size + batches (int): batch size scaling (float): Scaling factor. override_config (Optional[Dict[str, int]], optional): Defaults to None. Triton grid config @@ -116,18 +116,18 @@ def bgmv_shrink( assert lora_a_weights.is_contiguous() assert output_tensor.is_contiguous() # TODO tuning this config - batchs = lora_indices_tensor.size(0) + batches = lora_indices_tensor.size(0) N, K = lora_a_weights.shape[-2:] # K=hidden_size,N=rank BLOCK_N = triton.next_power_of_2(N) if override_config: config = override_config else: # First try to load optimal config from the file - config = get_lora_op_configs("shrink", batchs, K) + config = get_lora_op_configs("shrink", batches, K) grid = lambda META: ( META["SPLIT_K"], - batchs, + batches, ) _bgmv_shrink_kernel[grid]( inputs, diff --git a/vllm/lora/ops/sgmv_expand.py b/vllm/lora/ops/sgmv_expand.py index 879184db0b8b..f3a53b70f415 100644 --- a/vllm/lora/ops/sgmv_expand.py +++ b/vllm/lora/ops/sgmv_expand.py @@ -102,7 +102,7 @@ def sgmv_expand( b_seq_start_loc: torch.Tensor, seq_len_tensor: torch.Tensor, lora_indices_tensor: torch.Tensor, - batchs: int, + batches: int, max_seq_length: int, add_inputs: bool = False, ): @@ -119,7 +119,7 @@ def sgmv_expand( length of the sequences in the batch lora_indices_tensor (torch.Tensor): (batch_size,). The LoRA index corresponding to each batch - batchs (int): batch size + batches (int): batch size max_seq_length (int): The max sequence lengths of the sequences in the batch add_inputs (bool, optional): Defaults to False. 
adds the final lora @@ -132,8 +132,8 @@ def sgmv_expand( torch.bfloat16, ] assert inputs.size(1) == lora_b_weights.size(-1) - assert b_seq_start_loc.size(0) == batchs - assert lora_indices_tensor.size(0) == batchs + assert b_seq_start_loc.size(0) == batches + assert lora_indices_tensor.size(0) == batches assert inputs.is_contiguous() assert output_tensor.is_contiguous() @@ -161,7 +161,7 @@ def sgmv_expand( CAST_TYPE = True grid = [ triton.cdiv(max_seq_length, BLOCK_M) * triton.cdiv(N, BLOCK_N), - batchs, + batches, ] _sgmv_expand_kernel[grid]( inputs, diff --git a/vllm/lora/ops/sgmv_expand_slice.py b/vllm/lora/ops/sgmv_expand_slice.py index 000fef304823..52c71c5095b5 100644 --- a/vllm/lora/ops/sgmv_expand_slice.py +++ b/vllm/lora/ops/sgmv_expand_slice.py @@ -108,7 +108,7 @@ def sgmv_expand_slice( b_seq_start_loc: torch.Tensor, seq_len_tensor: torch.Tensor, lora_indices_tensor: torch.Tensor, - batchs: int, + batches: int, max_seq_length: int, slice_offset: int, slice_size: int, @@ -128,7 +128,7 @@ def sgmv_expand_slice( length of the sequences in the batch lora_indices_tensor (torch.Tensor): (batch_size,). The LoRA index corresponding to each batch - batchs (int): batch size + batches (int): batch size max_seq_length (int): The max sequence lengths of the sequences in the batch slice_offst (int): output_tensor's offst @@ -143,8 +143,8 @@ def sgmv_expand_slice( torch.bfloat16, ] assert inputs.size(1) == lora_b_weights.size(-1) - assert b_seq_start_loc.size(0) == batchs - assert lora_indices_tensor.size(0) == batchs + assert b_seq_start_loc.size(0) == batches + assert lora_indices_tensor.size(0) == batches assert slice_size == lora_b_weights.size(-2) assert inputs.is_contiguous() assert output_tensor.is_contiguous() @@ -173,7 +173,7 @@ def sgmv_expand_slice( CAST_TYPE = True grid = [ triton.cdiv(max_seq_length, BLOCK_M) * triton.cdiv(N, BLOCK_N), - batchs, + batches, ] _sgmv_expand_slice_kernel[grid]( inputs, diff --git a/vllm/lora/ops/sgmv_shrink.py b/vllm/lora/ops/sgmv_shrink.py index 45aeb9e9fb78..f295f0118f0b 100644 --- a/vllm/lora/ops/sgmv_shrink.py +++ b/vllm/lora/ops/sgmv_shrink.py @@ -105,7 +105,7 @@ def sgmv_shrink( b_seq_start_loc: torch.Tensor, seq_len_tensor: torch.Tensor, lora_indices_tensor: torch.Tensor, - batchs: int, + batches: int, max_seq_length: int, scaling: float, ): @@ -123,7 +123,7 @@ def sgmv_shrink( length of the sequences in the batch lora_indices_tensor (torch.Tensor): (batch_size,). The LoRA index corresponding to each batch - batchs (int): batch size + batches (int): batch size max_seq_length (int): The max sequence lengths of the sequences in the batch scaling (float): Scaling factor. 
@@ -135,8 +135,8 @@ def sgmv_shrink( torch.bfloat16, ] assert inputs.size(1) == lora_a_weights.size(-1) - assert b_seq_start_loc.size(0) == batchs - assert lora_indices_tensor.size(0) == batchs + assert b_seq_start_loc.size(0) == batches + assert lora_indices_tensor.size(0) == batches assert inputs.is_contiguous() if lora_a_weights.ndim == 4: # shape:(lora_num,1,rank, size) @@ -156,7 +156,7 @@ def sgmv_shrink( grid = [ triton.cdiv(max_seq_length, BLOCK_M) * triton.cdiv(N, BLOCK_N), SPLIT_K, - batchs, + batches, ] _sgmv_shrink_kernel[grid]( diff --git a/vllm/lora/ops/utils.py b/vllm/lora/ops/utils.py index 980dc8c6693f..78ba7c170353 100644 --- a/vllm/lora/ops/utils.py +++ b/vllm/lora/ops/utils.py @@ -8,11 +8,11 @@ def _get_config_file_name( op_type: str, - batchs: int, + batches: int, hidden_size: int, ) -> str: device_name = torch.cuda.get_device_name().replace(" ", "_") - return (f"op_type={op_type},batchs={batchs},hidden_size={hidden_size} " + + return (f"op_type={op_type},batches={batches},hidden_size={hidden_size} " + f"device_name={device_name}.json") @@ -29,7 +29,7 @@ def _get_op_configs(op_type: str, batch: int, hidden_size: int): if os.path.exists(config_file_path): with open(config_file_path) as f: tuned_config = json.load(f).get( - f"batchs={batch},hidden_size={hidden_size}", None) + f"batches={batch},hidden_size={hidden_size}", None) return tuned_config # If no optimized configuration is available, return None diff --git a/vllm/lora/punica.py b/vllm/lora/punica.py index e0b441e1dd08..aa96ba5f9240 100644 --- a/vllm/lora/punica.py +++ b/vllm/lora/punica.py @@ -44,8 +44,8 @@ def reset_params_cache(): cache explicitly """ #TODO release gpu memory - torch.cuda.empty_cache() _PARAMS_CACHE.clear() + torch.cuda.empty_cache() def _get_prefilling_params(token_lora_tensor: torch.Tensor, From 7eebe1c8514a1e765279d85c148cfcc235364733 Mon Sep 17 00:00:00 2001 From: jeejeeli Date: Mon, 1 Jul 2024 14:59:25 +0800 Subject: [PATCH 37/71] update docs --- vllm/lora/layers.py | 6 +++--- vllm/lora/ops/bgmv_shrink.py | 2 +- vllm/lora/ops/utils.py | 38 ++++++++---------------------------- 3 files changed, 12 insertions(+), 34 deletions(-) diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py index 0fe87d1ff2e8..9ae7050157fe 100644 --- a/vllm/lora/layers.py +++ b/vllm/lora/layers.py @@ -359,9 +359,9 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: full_lora_a_embeddings.shape[1], -1, ) - full_lora_a_embeddings = full_lora_a_embeddings.view( - -1, full_lora_a_embeddings.shape[-1]) - full_output = full_output.view(-1, full_output.shape[-1]) + # full_lora_a_embeddings = full_lora_a_embeddings.view( + # -1, full_lora_a_embeddings.shape[-1]) + # full_output = full_output.view(-1, full_output.shape[-1]) token_num = self.indices_len[0] is_prefilling = bool(self.indices_len[5]) add_expand( diff --git a/vllm/lora/ops/bgmv_shrink.py b/vllm/lora/ops/bgmv_shrink.py index 85c36fd9ce04..6e3d90e2d235 100644 --- a/vllm/lora/ops/bgmv_shrink.py +++ b/vllm/lora/ops/bgmv_shrink.py @@ -123,7 +123,7 @@ def bgmv_shrink( config = override_config else: # First try to load optimal config from the file - config = get_lora_op_configs("shrink", batches, K) + config = get_lora_op_configs("bgmv_shrink", batches, K) grid = lambda META: ( META["SPLIT_K"], diff --git a/vllm/lora/ops/utils.py b/vllm/lora/ops/utils.py index 78ba7c170353..1755ac92b0d6 100644 --- a/vllm/lora/ops/utils.py +++ b/vllm/lora/ops/utils.py @@ -1,39 +1,10 @@ import functools -import json -import os from typing import Dict -import torch - - -def 
_get_config_file_name( - op_type: str, - batches: int, - hidden_size: int, -) -> str: - device_name = torch.cuda.get_device_name().replace(" ", "_") - return (f"op_type={op_type},batches={batches},hidden_size={hidden_size} " + - f"device_name={device_name}.json") - @functools.lru_cache def _get_op_configs(op_type: str, batch: int, hidden_size: int): - FOLDER_NAME = "bgmv_configs" - json_file_name = _get_config_file_name(op_type, batch, hidden_size) - - config_file_path = os.path.join( - os.path.dirname(os.path.realpath(__file__)), - FOLDER_NAME, - json_file_name, - ) - if os.path.exists(config_file_path): - with open(config_file_path) as f: - tuned_config = json.load(f).get( - f"batches={batch},hidden_size={hidden_size}", None) - return tuned_config - - # If no optimized configuration is available, return None - return None + raise NotImplementedError def _get_default_config(op_type: str, batch: int, hidden_size: int): @@ -45,6 +16,13 @@ def _get_default_config(op_type: str, batch: int, hidden_size: int): def get_lora_op_configs(op_type: str, batch: int, hidden_size: int) -> Dict[str, int]: + """Inspired by `fused_moe_kernel` + The return value will be a dictionary mapping an irregular grid of batch + sizes and hidden_size to configurations of the bgmv-related kernel. + NOTE: It currently only supports the default configuration. We plan to + generate optimal configurations for different hardware in the future using + scripts similar to `benchmark_moe.py`. + """ config = _get_op_configs(op_type, batch, hidden_size) if not config: config = _get_default_config(op_type, batch, hidden_size) From 8ac0331537bd096454f4b79398dc35a58913cf9d Mon Sep 17 00:00:00 2001 From: jeejeeli Date: Mon, 1 Jul 2024 15:08:16 +0800 Subject: [PATCH 38/71] update docs --- vllm/lora/punica.py | 1 - 1 file changed, 1 deletion(-) diff --git a/vllm/lora/punica.py b/vllm/lora/punica.py index aa96ba5f9240..bf1c8b5a9a6c 100644 --- a/vllm/lora/punica.py +++ b/vllm/lora/punica.py @@ -43,7 +43,6 @@ def reset_params_cache(): """At the beginning of the prefilling stage, we need clear the cache explicitly """ - #TODO release gpu memory _PARAMS_CACHE.clear() torch.cuda.empty_cache() From ea4b3cdff320883091c1cc914697932515a563e4 Mon Sep 17 00:00:00 2001 From: jeejeeli Date: Mon, 1 Jul 2024 15:49:10 +0800 Subject: [PATCH 39/71] update docs --- vllm/lora/punica.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/vllm/lora/punica.py b/vllm/lora/punica.py index bf1c8b5a9a6c..c023ebc51eb6 100644 --- a/vllm/lora/punica.py +++ b/vllm/lora/punica.py @@ -18,7 +18,12 @@ def _compute_params( token_lora_tensor: torch.Tensor ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, int, int, ]: """ - Get the information required for the sgmv kernel. + Get the information required for the sgmv kernel. With the features: + 1. If consecutive requests in the batch use the same LoRA, this function + will combine them into a single request, improving sgmv kernel inference + performance. + 2. At the beginning of each prefilling stage inference, recalculations are + needed based on the input, but only once. 
""" pointer = token_lora_tensor.data_ptr() if pointer not in _PARAMS_CACHE: From 4a13f27396bfbf071a1e9858fd845a2d1ac98486 Mon Sep 17 00:00:00 2001 From: jeejeeli Date: Mon, 1 Jul 2024 17:07:53 +0800 Subject: [PATCH 40/71] fix bug --- vllm/lora/ops/utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/lora/ops/utils.py b/vllm/lora/ops/utils.py index 1755ac92b0d6..6a637288f71e 100644 --- a/vllm/lora/ops/utils.py +++ b/vllm/lora/ops/utils.py @@ -4,7 +4,8 @@ @functools.lru_cache def _get_op_configs(op_type: str, batch: int, hidden_size: int): - raise NotImplementedError + # TODO: add optimal configurations + return None def _get_default_config(op_type: str, batch: int, hidden_size: int): From a10f8bc548fa792852cb570b997f53e87cfb9af2 Mon Sep 17 00:00:00 2001 From: jeejeeli Date: Mon, 1 Jul 2024 22:26:25 +0800 Subject: [PATCH 41/71] reformat --- vllm/lora/punica.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm/lora/punica.py b/vllm/lora/punica.py index 65afca18a850..c023ebc51eb6 100644 --- a/vllm/lora/punica.py +++ b/vllm/lora/punica.py @@ -13,6 +13,7 @@ _PARAMS_CACHE: Dict[int, Tuple] = {} + def _compute_params( token_lora_tensor: torch.Tensor ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, int, int, ]: From 3fb601676cf7d5eab6c2920c47473d88b050ecf9 Mon Sep 17 00:00:00 2001 From: jeejeeli Date: Tue, 2 Jul 2024 16:24:06 +0800 Subject: [PATCH 42/71] test lazy import --- vllm/lora/punica.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/vllm/lora/punica.py b/vllm/lora/punica.py index c023ebc51eb6..a9b3040674d1 100644 --- a/vllm/lora/punica.py +++ b/vllm/lora/punica.py @@ -4,13 +4,6 @@ import torch -from vllm.lora.ops.bgmv_expand import bgmv_expand -from vllm.lora.ops.bgmv_expand_slice import bgmv_expand_slice -from vllm.lora.ops.bgmv_shrink import bgmv_shrink -from vllm.lora.ops.sgmv_expand import sgmv_expand -from vllm.lora.ops.sgmv_expand_slice import sgmv_expand_slice -from vllm.lora.ops.sgmv_shrink import sgmv_shrink - _PARAMS_CACHE: Dict[int, Tuple] = {} @@ -73,6 +66,9 @@ def add_shrink( y=x@w_t_all When `is_prefilling` is True, will launch `sgmv_shrink` """ + from vllm.lora.ops.bgmv_shrink import bgmv_shrink + from vllm.lora.ops.sgmv_shrink import sgmv_shrink + if is_prefilling: ( b_seq_start_tensor, @@ -110,6 +106,8 @@ def add_expand( y+=x@w_t_all When `is_prefilling` is True, will launch `sgmv_expand`, """ + from vllm.lora.ops.bgmv_expand import bgmv_expand + from vllm.lora.ops.sgmv_expand import sgmv_expand if is_prefilling: ( b_seq_start_tensor, @@ -148,6 +146,8 @@ def add_expand_slice( """ y+=x@w_t_all """ + from vllm.lora.ops.bgmv_expand_slice import bgmv_expand_slice + from vllm.lora.ops.sgmv_expand_slice import sgmv_expand_slice if is_prefilling: ( b_seq_start_tensor, From e49a5dc136c8ac2d6368ec1c77964acd1e9d5558 Mon Sep 17 00:00:00 2001 From: jeejeeli Date: Wed, 3 Jul 2024 23:27:11 +0800 Subject: [PATCH 43/71] merge --- vllm/lora/punica.py | 16 ---------------- 1 file changed, 16 deletions(-) diff --git a/vllm/lora/punica.py b/vllm/lora/punica.py index 2dd464552dfb..c023ebc51eb6 100644 --- a/vllm/lora/punica.py +++ b/vllm/lora/punica.py @@ -4,22 +4,16 @@ import torch -<<<<<<< HEAD from vllm.lora.ops.bgmv_expand import bgmv_expand from vllm.lora.ops.bgmv_expand_slice import bgmv_expand_slice from vllm.lora.ops.bgmv_shrink import bgmv_shrink from vllm.lora.ops.sgmv_expand import sgmv_expand from vllm.lora.ops.sgmv_expand_slice import sgmv_expand_slice from vllm.lora.ops.sgmv_shrink import 
sgmv_shrink -======= -from vllm import _custom_ops as ops -from vllm.platforms import current_platform ->>>>>>> origin/main _PARAMS_CACHE: Dict[int, Tuple] = {} -<<<<<<< HEAD def _compute_params( token_lora_tensor: torch.Tensor ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, int, int, ]: @@ -48,16 +42,6 @@ def _compute_params( max_length, ) return _PARAMS_CACHE[pointer] -======= - if current_platform.get_device_capability() < (8, 0): - raise ImportError( - "punica LoRA kernels require compute capability >= 8.0") - else: - raise ImportError( - "punica LoRA kernels could not be imported. If you built vLLM " - "from source, make sure VLLM_INSTALL_PUNICA_KERNELS=1 env var " - "was set.") ->>>>>>> origin/main def reset_params_cache(): From 66dd88f41c7caf5ad940ed6017701b345a2a79e4 Mon Sep 17 00:00:00 2001 From: jeejeeli Date: Wed, 3 Jul 2024 23:34:53 +0800 Subject: [PATCH 44/71] merge main --- vllm/lora/punica.py | 14 +++++++------- vllm/worker/model_runner.py | 7 ++----- 2 files changed, 9 insertions(+), 12 deletions(-) diff --git a/vllm/lora/punica.py b/vllm/lora/punica.py index a9b3040674d1..c023ebc51eb6 100644 --- a/vllm/lora/punica.py +++ b/vllm/lora/punica.py @@ -4,6 +4,13 @@ import torch +from vllm.lora.ops.bgmv_expand import bgmv_expand +from vllm.lora.ops.bgmv_expand_slice import bgmv_expand_slice +from vllm.lora.ops.bgmv_shrink import bgmv_shrink +from vllm.lora.ops.sgmv_expand import sgmv_expand +from vllm.lora.ops.sgmv_expand_slice import sgmv_expand_slice +from vllm.lora.ops.sgmv_shrink import sgmv_shrink + _PARAMS_CACHE: Dict[int, Tuple] = {} @@ -66,9 +73,6 @@ def add_shrink( y=x@w_t_all When `is_prefilling` is True, will launch `sgmv_shrink` """ - from vllm.lora.ops.bgmv_shrink import bgmv_shrink - from vllm.lora.ops.sgmv_shrink import sgmv_shrink - if is_prefilling: ( b_seq_start_tensor, @@ -106,8 +110,6 @@ def add_expand( y+=x@w_t_all When `is_prefilling` is True, will launch `sgmv_expand`, """ - from vllm.lora.ops.bgmv_expand import bgmv_expand - from vllm.lora.ops.sgmv_expand import sgmv_expand if is_prefilling: ( b_seq_start_tensor, @@ -146,8 +148,6 @@ def add_expand_slice( """ y+=x@w_t_all """ - from vllm.lora.ops.bgmv_expand_slice import bgmv_expand_slice - from vllm.lora.ops.sgmv_expand_slice import sgmv_expand_slice if is_prefilling: ( b_seq_start_tensor, diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 16fc21ef3207..de550d472082 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -1032,11 +1032,8 @@ def capture_model(self, kv_caches: List[List[torch.Tensor]]) -> None: ) if self.lora_config: - lora_mapping = LoRAMapping( - [0] * batch_size, - [0] * batch_size, - False - ) + lora_mapping = LoRAMapping([0] * batch_size, + [0] * batch_size, False) self.set_active_loras(set(), lora_mapping) graph_runner = CUDAGraphRunner( From 0cedeb34e533ba2e936fb607d433eed12c0bd3d0 Mon Sep 17 00:00:00 2001 From: jeejeeli Date: Thu, 4 Jul 2024 16:51:56 +0800 Subject: [PATCH 45/71] modify punica --- tests/lora/test_triton_punica.py | 11 +- vllm/lora/fully_sharded_layers.py | 24 +-- vllm/lora/layers.py | 25 ++- vllm/lora/models.py | 12 +- vllm/lora/punica.py | 264 ++++++++++++++++++------------ 5 files changed, 192 insertions(+), 144 deletions(-) diff --git a/tests/lora/test_triton_punica.py b/tests/lora/test_triton_punica.py index eea190b153a1..9bbc529188d8 100644 --- a/tests/lora/test_triton_punica.py +++ b/tests/lora/test_triton_punica.py @@ -62,11 +62,10 @@ 128256, ] -batches = [1, 2, 4] + [8 * i for i in range(1, 4)] 
- +BATCHES = [1, 2, 4] + [8 * i for i in range(1, 4)] NUM_LORA = [1, 4, 8, 16, 32, 64, 128] DTYPES = [torch.float16, torch.bfloat16] -MAX_RANKS = [8, 16, 32, 64] +MAX_RANKS = [1] SCALES = [0.5] SEED = [0] CUDA_DEVICES = [f"cuda:{0}"] @@ -222,7 +221,7 @@ def _generate_data_expand_nslices(batches, hidden_size, lora_nums, max_rank, raise error -@pytest.mark.parametrize("batches", batches) +@pytest.mark.parametrize("batches", BATCHES) @pytest.mark.parametrize("num_loras", NUM_LORA) @pytest.mark.parametrize("rank", MAX_RANKS) @pytest.mark.parametrize("scaling", SCALES) @@ -306,7 +305,7 @@ def test_punica_sgmv( assert_close(our_out_tensor, ref_out_tensor) -@pytest.mark.parametrize("batches", batches) +@pytest.mark.parametrize("batches", BATCHES) @pytest.mark.parametrize("num_loras", NUM_LORA) @pytest.mark.parametrize("rank", MAX_RANKS) @pytest.mark.parametrize("scaling", SCALES) @@ -377,7 +376,7 @@ def test_punica_bgmv( assert_close(our_out_tensor, ref_out_tensor) -@pytest.mark.parametrize("batches", batches) +@pytest.mark.parametrize("batches", BATCHES) @pytest.mark.parametrize("num_loras", NUM_LORA) @pytest.mark.parametrize("rank", MAX_RANKS) @pytest.mark.parametrize("nslices", [2, 3]) diff --git a/vllm/lora/fully_sharded_layers.py b/vllm/lora/fully_sharded_layers.py index 3ff69c930247..d21649bed5d6 100644 --- a/vllm/lora/fully_sharded_layers.py +++ b/vllm/lora/fully_sharded_layers.py @@ -65,7 +65,7 @@ def apply(self, x: torch.Tensor, device=x.device, ) token_num = self.indices_len[0] - is_prefilling = bool(self.indices_len[5]) + is_prefill = bool(self.indices_len[5]) add_shrink( buffer, x, @@ -73,7 +73,7 @@ def apply(self, x: torch.Tensor, self.indices[:token_num], 0, 1.0, - is_prefilling, + is_prefill, ) buffer = tensor_model_parallel_all_gather(buffer) add_expand( @@ -82,7 +82,7 @@ def apply(self, x: torch.Tensor, self.lora_b_stacked, self.indices[:token_num], 0, - is_prefilling, + is_prefill, add_input=True, ) # now have column partitioned output @@ -130,7 +130,7 @@ def _mcp_apply(x, bias, layer): device=x.device, ) token_num = layer.indices_len[0] - is_prefilling = bool(layer.indices_len[5]) + is_prefill = bool(layer.indices_len[5]) for idx in range(n): add_shrink( @@ -140,7 +140,7 @@ def _mcp_apply(x, bias, layer): layer.indices[:token_num], 0, 1.0, - is_prefilling, + is_prefill, ) buffers = tensor_model_parallel_all_gather(buffers) @@ -153,7 +153,7 @@ def _mcp_apply(x, bias, layer): layer.lora_b_stacked[idx], layer.indices[:token_num], 0, - is_prefilling, + is_prefill, left_offset, shard_size, add_input=True, @@ -239,10 +239,10 @@ def apply(self, x: torch.Tensor, device=x.device) token_num = self.indices_len[0] - is_prefilling = bool(self.indices_len[5]) + is_prefill = bool(self.indices_len[5]) add_shrink(buffer, x, self.lora_a_stacked, self.indices[:token_num], 0, - 1.0, is_prefilling) + 1.0, is_prefill) buffer = tensor_model_parallel_all_gather(buffer) add_expand(output, @@ -250,7 +250,7 @@ def apply(self, x: torch.Tensor, self.lora_b_stacked, self.indices[:token_num], 0, - is_prefilling, + is_prefill, add_input=True) # now have column partitioned output @@ -346,7 +346,7 @@ def apply(self, x: torch.Tensor) -> torch.Tensor: device=x.device, ) token_num = self.indices_len[0] - is_prefilling = bool(self.indices_len[5]) + is_prefill = bool(self.indices_len[5]) add_shrink( buffer, x, @@ -354,7 +354,7 @@ def apply(self, x: torch.Tensor) -> torch.Tensor: self.indices[:token_num], 0, 1.0, - is_prefilling, + is_prefill, ) buffer = tensor_model_parallel_all_reduce(buffer) @@ -372,7 +372,7 @@ 
def apply(self, x: torch.Tensor) -> torch.Tensor: self.lora_b_stacked, self.indices[:self.indices_len[0]], 0, - is_prefilling, + is_prefill, start_idx, shard_size, ) diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py index 9cb122ee9547..632eb75f9699 100644 --- a/vllm/lora/layers.py +++ b/vllm/lora/layers.py @@ -83,7 +83,7 @@ def _apply_lora(x: torch.Tensor, index corresponding to each token indices_len(List):(6,), It contains (base_indices, sampler_indices, sampler_indices_padded,embeddings_indices, long_lora_indices, - prefilling flag). + prefill flag). output (torch.Tensor): (batch_size, output_dim) Returns: @@ -94,7 +94,7 @@ def _apply_lora(x: torch.Tensor, x = x.view(-1, x.shape[-1]) output = output.view(-1, output.shape[-1]) token_num = indices_info[0] - is_prefilling = bool(indices_info[5]) + is_prefill = bool(indices_info[5]) add_lora(output, x, lora_a_stacked, @@ -102,7 +102,7 @@ def _apply_lora(x: torch.Tensor, lora_index_tensor[:token_num], 0, 1.0, - is_prefilling, + is_prefill, cache_clear=cache_clear) return output.view_as(org_output) @@ -127,7 +127,7 @@ def _apply_lora_packed_nslice(x: torch.Tensor, output = output.view(-1, output.shape[-1]) token_num = indices_info[0] - is_prefilling = bool(indices_info[5]) + is_prefill = bool(indices_info[5]) offset_left = 0 # TODO fuse these kernels for slice_idx in range(len(output_slices)): @@ -138,7 +138,7 @@ def _apply_lora_packed_nslice(x: torch.Tensor, lora_index_tensor[:token_num], 0, 1.0, - is_prefilling, + is_prefill, offset_left, output_slices[slice_idx], cache_clear=cache_clear) @@ -153,8 +153,8 @@ class LoRAMapping: index_mapping: Tuple[int, ...] # Per sampled token: prompt_mapping: Tuple[int, ...] - # prefilling or decoding. - is_prefilling: bool = False + # prefill stage or decode stage. + is_prefill: bool = False def __post_init__(self): self.index_mapping = tuple(self.index_mapping) @@ -363,14 +363,14 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: # -1, full_lora_a_embeddings.shape[-1]) # full_output = full_output.view(-1, full_output.shape[-1]) token_num = self.indices_len[0] - is_prefilling = bool(self.indices_len[5]) + is_prefill = bool(self.indices_len[5]) add_expand( full_output, full_lora_a_embeddings, self.lora_b_stacked, self.indices[:token_num], 0, - is_prefilling, + is_prefill, add_input=True, ) return full_output.view_as(full_output_org) @@ -1297,10 +1297,9 @@ def _get_logits( self.base_layer.org_vocab_size:self.base_layer.org_vocab_size + lora_logits.shape[1], ] = lora_logits - # LogitsProcessorWithLoRA always using bgmv - # sampler_indices sampler_indices = self.indices_len[1] - is_prefilling = False + # LogitsProcessorWithLoRA always using bgmv + is_prefill = False add_lora( logits, hidden_states, @@ -1309,7 +1308,7 @@ def _get_logits( self.indices[:sampler_indices], 0, 1.0, - is_prefilling, + is_prefill, ) # Remove paddings in vocab (if any). logits = logits[:, :self.base_layer.vocab_size] diff --git a/vllm/lora/models.py b/vllm/lora/models.py index 4ef8f6115f0f..5324d50380dc 100644 --- a/vllm/lora/models.py +++ b/vllm/lora/models.py @@ -78,10 +78,10 @@ def convert_mapping( long_lora_indices: Tensor of shape [batch_size] mapping requests to RoPE offsets and rot dims for long LoRAs. None if long context lora doesn't exist. - indices_len: List of lengths of the above tensors and prefilling + indices_len: List of lengths of the above tensors and prefill flag. Used to index into each tensor. 
It contains (base_indices, sampler_indices, sampler_indices_padded, - embeddings_indices, long_lora_indices, prefilling flag). + embeddings_indices, long_lora_indices, prefill flag). """ index_mapping_indices: List[int] = list(mapping.index_mapping).copy() embedding_indices = index_mapping_indices.copy() @@ -149,9 +149,9 @@ def convert_mapping( else: #If long_lora doesn't exist,append None indices_len.append(None) - # Append a prefilling flag to help selecting the appropriate lora + # Append a prefill flag to help selecting the appropriate lora # ops (sgmv or bgmv) - indices_len.append(int(mapping.is_prefilling)) + indices_len.append(int(mapping.is_prefill)) return ( base_indices, sampler_indices, @@ -458,7 +458,7 @@ def __init__( self.scaling_factor_to_offset: Dict[float, int] = {} # 6 is the number of indicies tensors. # base_indices, sampler_indices, sampler_indices_padded, - # embeddings_indices,long_lora_indices,prefilling or decoding + # embeddings_indices,long_lora_indices,prefill or decode stage self.indices_len: List[Optional[int]] = [None] * 6 self.model = model @@ -622,7 +622,7 @@ def _set_lora_mapping(self, mapping: LoRAMapping) -> None: # Maintain the reference self.indices_len[:] = indices_len # - if mapping.is_prefilling: + if mapping.is_prefill: punica.reset_params_cache() punica._compute_params(self.base_indices[:base_indices.shape[0]]) diff --git a/vllm/lora/punica.py b/vllm/lora/punica.py index c023ebc51eb6..9474744ade2e 100644 --- a/vllm/lora/punica.py +++ b/vllm/lora/punica.py @@ -22,7 +22,7 @@ def _compute_params( 1. If consecutive requests in the batch use the same LoRA, this function will combine them into a single request, improving sgmv kernel inference performance. - 2. At the beginning of each prefilling stage inference, recalculations are + 2. At the beginning of each prefill stage inference, recalculations are needed based on the input, but only once. 
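    For illustration, a minimal sketch (tensor values are made up) of how the
    consecutive merging described in point 1 can be derived with
    `torch.unique_consecutive`:

        import torch

        # Hypothetical per-token LoRA indices for one prefill batch.
        token_lora = torch.tensor([0, 0, 0, 1, 1, 2, 2, 2])

        lora_ids, seq_lens = torch.unique_consecutive(token_lora,
                                                      return_counts=True)
        starts = torch.zeros_like(seq_lens)
        starts[1:] = torch.cumsum(seq_lens, dim=0)[:-1]

        # lora_ids -> tensor([0, 1, 2])   one sgmv "request" per run
        # seq_lens -> tensor([3, 2, 3])
        # starts   -> tensor([0, 3, 5])
        # i.e. batch_size=3 and max_length=3 instead of 8 token-level entries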
""" pointer = token_lora_tensor.data_ptr() @@ -45,20 +45,123 @@ def _compute_params( def reset_params_cache(): - """At the beginning of the prefilling stage, we need clear the + """At the beginning of the prefill stage, we need clear the cache explicitly """ _PARAMS_CACHE.clear() torch.cuda.empty_cache() -def _get_prefilling_params(token_lora_tensor: torch.Tensor, - cache_clear: bool = False): +def _get_prefill_params(token_lora_tensor: torch.Tensor, + cache_clear: bool = False): if cache_clear: reset_params_cache() return _compute_params(token_lora_tensor) +def shrink_prefill( + y: torch.Tensor, + x: torch.Tensor, + w_t_all: torch.Tensor, + lora_indices_tensor: torch.Tensor, + layer_idx: int, + scale: float, + cache_clear: bool = False, +): + ( + b_seq_start_tensor, + seq_length_tensor, + last_lora_indices_tensor, + batch_size, + max_length, + ) = _get_prefill_params(lora_indices_tensor, cache_clear) + sgmv_shrink( + x, + w_t_all, + y, + b_seq_start_tensor, + seq_length_tensor, + last_lora_indices_tensor, + batch_size, + max_length, + scale, + ) + + +def shrink_decode( + y: torch.Tensor, + x: torch.Tensor, + w_t_all: torch.Tensor, + lora_indices_tensor: torch.Tensor, + layer_idx: int, + scale: float, +): + bgmv_shrink(x, w_t_all, y, lora_indices_tensor, scale) + + +def expand_prefill( + y: torch.Tensor, + x: torch.Tensor, + w_t_all: torch.Tensor, + lora_indices_tensor: torch.Tensor, + layer_idx: int, + add_input: bool, + cache_clear: bool = False, +): + ( + b_seq_start_tensor, + seq_length_tensor, + last_lora_indices_tensor, + batch_size, + max_length, + ) = _get_prefill_params(lora_indices_tensor, cache_clear) + sgmv_expand(x, w_t_all, y, b_seq_start_tensor, seq_length_tensor, + last_lora_indices_tensor, batch_size, max_length, add_input) + + +def expand_decode( + y: torch.Tensor, + x: torch.Tensor, + w_t_all: torch.Tensor, + lora_indices_tensor: torch.Tensor, + layer_idx: int, + add_input: bool, +): + bgmv_expand(x, w_t_all, y, lora_indices_tensor, add_input) + + +def expand_slice_prefill( + y: torch.Tensor, + x: torch.Tensor, + w_t_all: torch.Tensor, + lora_indices_tensor: torch.Tensor, + layer_idx: int, + y_offset: Optional[int], + y_slice_size: Optional[int], + add_input: bool, + cache_clear: bool = False, +): + ( + b_seq_start_tensor, + seq_length_tensor, + last_lora_indices_tensor, + batch_size, + max_length, + ) = _get_prefill_params(lora_indices_tensor, cache_clear) + sgmv_expand_slice(x, w_t_all, y, b_seq_start_tensor, seq_length_tensor, + last_lora_indices_tensor, batch_size, max_length, + y_offset, y_slice_size, add_input) + + +def expand_slice_decode(y: torch.Tensor, x: torch.Tensor, + w_t_all: torch.Tensor, + lora_indices_tensor: torch.Tensor, layer_idx: int, + y_offset: Optional[int], y_slice_size: Optional[int], + add_input: bool): + bgmv_expand_slice(x, w_t_all, y, lora_indices_tensor, y_offset, + y_slice_size, add_input) + + def add_shrink( y: torch.Tensor, x: torch.Tensor, @@ -66,34 +169,22 @@ def add_shrink( lora_indices_tensor: torch.Tensor, layer_idx: int, scale: float, - is_prefilling: bool, + is_prefill: bool, cache_clear: bool = False, ): """ - y=x@w_t_all - When `is_prefilling` is True, will launch `sgmv_shrink` + Perform the ` y+=x@w_t_all` computation, which is suitable for the + GEMM of lora'a. + When `is_prefill is` true, it indicates that it is currently the + prefill stage, and the `shrink_prefill` function should be called. + Otherwise, it is the decode stage, and the shrink_decode function + should be called. 
""" - if is_prefilling: - ( - b_seq_start_tensor, - seq_length_tensor, - last_lora_indices_tensor, - batch_size, - max_length, - ) = _get_prefilling_params(lora_indices_tensor, cache_clear) - sgmv_shrink( - x, - w_t_all, - y, - b_seq_start_tensor, - seq_length_tensor, - last_lora_indices_tensor, - batch_size, - max_length, - scale, - ) + if is_prefill: + shrink_prefill(y, x, w_t_all, lora_indices_tensor, layer_idx, scale, + cache_clear) else: - bgmv_shrink(x, w_t_all, y, lora_indices_tensor, scale) + shrink_decode(y, x, w_t_all, lora_indices_tensor, layer_idx, scale) def add_expand( @@ -102,35 +193,23 @@ def add_expand( w_t_all: torch.Tensor, lora_indices_tensor: torch.Tensor, layer_idx: int, - is_prefilling: bool, + is_prefill: bool, add_input: bool = True, cache_clear: bool = False, ): """ - y+=x@w_t_all - When `is_prefilling` is True, will launch `sgmv_expand`, + Perform the ` y+=x@w_t_all` computation, which is suitable for the + GEMM of lora'b. + When `is_prefill` is true, it indicates that it is currently the + prefill stage, and the `expand_prefill` function should be called. + Otherwise, it is the decode stage, and the expand_decode function + should be called. """ - if is_prefilling: - ( - b_seq_start_tensor, - seq_length_tensor, - last_lora_indices_tensor, - batch_size, - max_length, - ) = _get_prefilling_params(lora_indices_tensor, cache_clear) - sgmv_expand( - x, - w_t_all, - y, - b_seq_start_tensor, - seq_length_tensor, - last_lora_indices_tensor, - batch_size, - max_length, - add_input, - ) + if is_prefill: + expand_prefill(y, x, w_t_all, lora_indices_tensor, layer_idx, + add_input, cache_clear) else: - bgmv_expand(x, w_t_all, y, lora_indices_tensor, add_inputs=add_input) + expand_decode(y, x, w_t_all, lora_indices_tensor, layer_idx, add_input) def add_expand_slice( @@ -139,46 +218,21 @@ def add_expand_slice( w_t_all: torch.Tensor, lora_indices_tensor: torch.Tensor, layer_idx: int, - is_prefilling: bool, + is_prefill: bool, y_offset: Optional[int], y_slice_size: Optional[int], add_input: bool = True, cache_clear: bool = False, ): """ - y+=x@w_t_all + Similar to `add_expand` """ - if is_prefilling: - ( - b_seq_start_tensor, - seq_length_tensor, - last_lora_indices_tensor, - batch_size, - max_length, - ) = _get_prefilling_params(lora_indices_tensor, cache_clear) - sgmv_expand_slice( - x, - w_t_all, - y, - b_seq_start_tensor, - seq_length_tensor, - last_lora_indices_tensor, - batch_size, - max_length, - y_offset, - y_slice_size, - add_input, - ) + if is_prefill: + expand_slice_prefill(y, x, w_t_all, lora_indices_tensor, layer_idx, + y_offset, y_slice_size, add_input, cache_clear) else: - bgmv_expand_slice( - x, - w_t_all, - y, - lora_indices_tensor, - y_offset, - y_slice_size, - add_inputs=add_input, - ) + expand_slice_decode(y, x, w_t_all, lora_indices_tensor, layer_idx, + y_offset, y_slice_size, add_input) def add_lora( @@ -189,7 +243,7 @@ def add_lora( lora_indices_tensor: torch.Tensor, layer_idx: int, scale: float, - is_prefilling: bool, + is_prefill: bool, y_offset: Optional[int] = None, y_slice_size: Optional[int] = None, *, @@ -212,7 +266,7 @@ def add_lora( lora_indices_tensor (torch.Tensor): _description_ layer_idx (int): Layer index of LoRA weights. scale (float): Scaling factor. - is_prefilling (bool): prefiling stage + is_prefill (bool): prefiling stage y_offset (Optional[int], optional): Offset to apply to the starting column of y. y_slice_size (Optional[int], optional): Size of the y column slice.. 
@@ -235,30 +289,26 @@ def add_lora( lora_indices_tensor, 0, scale, - is_prefilling, + is_prefill, cache_clear=cache_clear, ) if y_offset is None and y_slice_size is None: - add_expand( - y, - buffer, - wb_t_all, - lora_indices_tensor, - 0, - is_prefilling, - add_input=True, - cache_clear=cache_clear, - ) + add_expand(y, + buffer, + wb_t_all, + lora_indices_tensor, + 0, + is_prefill, + add_input=True, + cache_clear=cache_clear) else: - add_expand_slice( - y, - buffer, - wb_t_all, - lora_indices_tensor, - 0, - is_prefilling, - y_offset, - y_slice_size, - add_input=True, - cache_clear=cache_clear, - ) + add_expand_slice(y, + buffer, + wb_t_all, + lora_indices_tensor, + 0, + is_prefill, + y_offset, + y_slice_size, + add_input=True, + cache_clear=cache_clear) From 59d17f457ca7cb46cd16fdeb512597096f2ee385 Mon Sep 17 00:00:00 2001 From: jeejeeli Date: Tue, 9 Jul 2024 00:22:52 +0800 Subject: [PATCH 46/71] refactor sgmv metadata --- tests/lora/test_lora.py | 14 +-- tests/lora/test_triton_punica.py | 4 +- vllm/lora/layers.py | 10 +- vllm/lora/models.py | 14 ++- vllm/lora/punica.py | 166 +++++++++++++++++++++---------- 5 files changed, 137 insertions(+), 71 deletions(-) diff --git a/tests/lora/test_lora.py b/tests/lora/test_lora.py index 51708c8fa6e5..a4ca7a93e62e 100644 --- a/tests/lora/test_lora.py +++ b/tests/lora/test_lora.py @@ -20,7 +20,7 @@ torch.bfloat16: (3e-2, 2e-2), } -STAGES = [0, 1] #prefilling(1) or decoding(0) +STAGES = [0, 1] #prefill stage(1) or decode stage(0) @pytest.mark.parametrize("m", TENSOR_SIZES) @@ -68,7 +68,7 @@ def test_apply_lora(m, n, k, rank, dtype, stage) -> None: device="cuda"), indices_info, output, - cache_clear=True) + need_update=True) rtol, atol = TOLERANCES[dtype] assert torch.allclose(expected, output, rtol=rtol, atol=atol) @@ -80,7 +80,7 @@ def test_apply_lora(m, n, k, rank, dtype, stage) -> None: torch.full((len(input), ), -1, device="cuda"), indices_info, output, - cache_clear=True) + need_update=True) assert torch.allclose(torch.zeros_like(output), output) manager.reset_lora() @@ -149,7 +149,7 @@ def test_apply_lora_packed_2slice(m, n, k, rank, dtype, stage) -> None: device="cuda"), indices_info, output, (m // 2, m // 2), - cache_clear=True) + need_update=True) rtol, atol = TOLERANCES[dtype] assert torch.allclose(expected, output, rtol=rtol, atol=atol) @@ -161,7 +161,7 @@ def test_apply_lora_packed_2slice(m, n, k, rank, dtype, stage) -> None: torch.full((len(input), ), -1, device="cuda"), indices_info, output, (m // 2, m // 2), - cache_clear=True) + need_update=True) assert torch.allclose(torch.zeros_like(output), output) manager.reset_lora() @@ -245,7 +245,7 @@ def test_apply_lora_packed_3slice(qkv, n, k, rank, dtype, stage) -> None: device="cuda"), indices_info, output, (qkv[0], qkv[1], qkv[2]), - cache_clear=True) + need_update=True) rtol, atol = TOLERANCES[dtype] assert torch.allclose(expected, output, rtol=rtol, atol=atol) @@ -257,7 +257,7 @@ def test_apply_lora_packed_3slice(qkv, n, k, rank, dtype, stage) -> None: torch.full((len(input), ), -1, device="cuda"), indices_info, output, (qkv[0], qkv[1], qkv[2]), - cache_clear=True) + need_update=True) assert torch.allclose(torch.zeros_like(output), output) manager.reset_lora() diff --git a/tests/lora/test_triton_punica.py b/tests/lora/test_triton_punica.py index 9bbc529188d8..3ed2f032241e 100644 --- a/tests/lora/test_triton_punica.py +++ b/tests/lora/test_triton_punica.py @@ -62,10 +62,10 @@ 128256, ] -BATCHES = [1, 2, 4] + [8 * i for i in range(1, 4)] +BATCHES = [1, 2, 4] + [8 * i for i in range(1, 7)] 
NUM_LORA = [1, 4, 8, 16, 32, 64, 128] DTYPES = [torch.float16, torch.bfloat16] -MAX_RANKS = [1] +MAX_RANKS = [1, 4, 8, 16, 32, 64, 128] SCALES = [0.5] SEED = [0] CUDA_DEVICES = [f"cuda:{0}"] diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py index 632eb75f9699..57b0eb2347d8 100644 --- a/vllm/lora/layers.py +++ b/vllm/lora/layers.py @@ -69,7 +69,7 @@ def _apply_lora(x: torch.Tensor, lora_index_tensor: torch.Tensor, indices_info: List[int], output: torch.Tensor, - cache_clear: bool = False) -> torch.Tensor: + need_update: bool = False) -> torch.Tensor: """Applies lora to each input. This method applies all loras to each input. It uses the `lora_index_tensor` vector to determine which lora yields the correct output. An index of -1 means no lora should be @@ -85,6 +85,8 @@ def _apply_lora(x: torch.Tensor, sampler_indices_padded,embeddings_indices, long_lora_indices, prefill flag). output (torch.Tensor): (batch_size, output_dim) + need_update (bool, optional): Indicates whether updating sgmv metadata + is needed. Defaults to False. Returns: output (torch.Tensor): (batch_size*seq_number, output_dim) @@ -103,7 +105,7 @@ def _apply_lora(x: torch.Tensor, 0, 1.0, is_prefill, - cache_clear=cache_clear) + need_update=need_update) return output.view_as(org_output) @@ -116,7 +118,7 @@ def _apply_lora_packed_nslice(x: torch.Tensor, indices_info: List[int], output: torch.Tensor, output_slices: Tuple[int, ...], - cache_clear: bool = False) -> torch.Tensor: + need_update: bool = False) -> torch.Tensor: """ Applies lora to each input. Similar to _apply_lora, This method is used for layers that are composed of multiple sublayers @@ -141,7 +143,7 @@ def _apply_lora_packed_nslice(x: torch.Tensor, is_prefill, offset_left, output_slices[slice_idx], - cache_clear=cache_clear) + need_update=need_update) offset_left += output_slices[slice_idx] return output.view_as(org_output) diff --git a/vllm/lora/models.py b/vllm/lora/models.py index 5324d50380dc..e18fdaa00e9a 100644 --- a/vllm/lora/models.py +++ b/vllm/lora/models.py @@ -12,11 +12,11 @@ from vllm.config import LoRAConfig from vllm.logger import init_logger -from vllm.lora import punica from vllm.lora.layers import (BaseLayerWithLoRA, LinearScalingRotaryEmbeddingWithLora, LoRAMapping) from vllm.lora.lora import LoRALayerWeights, PackedLoRALayerWeights +from vllm.lora.punica import PrefillHelper from vllm.lora.utils import (from_layer, from_layer_logits_processor, parse_fine_tuned_lora_name, replace_submodule) from vllm.model_executor.models.interfaces import SupportsLoRA @@ -26,6 +26,9 @@ _GLOBAL_LORA_ID = 0 +# NOTE This value comes fromllm/worker/model_runner.py +_MAX_BATCH_SIZE = 256 + @dataclass class LongContextLoRAContext: @@ -460,6 +463,9 @@ def __init__( # base_indices, sampler_indices, sampler_indices_padded, # embeddings_indices,long_lora_indices,prefill or decode stage self.indices_len: List[Optional[int]] = [None] * 6 + self.prefill_helper = PrefillHelper(max_batches=_MAX_BATCH_SIZE, + device=str( + self.base_indices.device)) self.model = model if hasattr(self.model, "supported_lora_modules"): @@ -621,10 +627,9 @@ def _set_lora_mapping(self, mapping: LoRAMapping) -> None: self.long_lora_indices.zero_() # Maintain the reference self.indices_len[:] = indices_len - # if mapping.is_prefill: - punica.reset_params_cache() - punica._compute_params(self.base_indices[:base_indices.shape[0]]) + self.prefill_helper.get_metadata( + self.base_indices[:base_indices.shape[0]], need_update=True) def set_lora_mapping(self, lora_mapping: LoRAMapping) -> None: if 
self._last_mapping != lora_mapping: @@ -643,7 +648,6 @@ def remove_all_loras(self): self._registered_loras.clear() self.lora_index_to_id = [None] * self.lora_slots self._active_loras.clear() - punica.reset_params_cache() def _create_lora_modules(self): for module_name, module in self.model.named_modules( diff --git a/vllm/lora/punica.py b/vllm/lora/punica.py index 9474744ade2e..5347a68852a0 100644 --- a/vllm/lora/punica.py +++ b/vllm/lora/punica.py @@ -1,6 +1,11 @@ -# Based on code from https://github.com/punica-ai/punica +""" +Based on: +Chen, L., Ye, Z., Wu, Y., Zhuo, D., Ceze, L., & Krishnamurthy, A. (2023). +Punica: Multi-Tenant LoRA Serving. +https://arxiv.org/abs/2310.18547 +""" -from typing import Dict, Optional, Tuple +from typing import Optional, Tuple import torch @@ -11,12 +16,10 @@ from vllm.lora.ops.sgmv_expand_slice import sgmv_expand_slice from vllm.lora.ops.sgmv_shrink import sgmv_shrink -_PARAMS_CACHE: Dict[int, Tuple] = {} - -def _compute_params( +def _compute_meta( token_lora_tensor: torch.Tensor -) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, int, int, ]: +) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, int, int, ]: """ Get the information required for the sgmv kernel. With the features: 1. If consecutive requests in the batch use the same LoRA, this function @@ -25,38 +28,94 @@ def _compute_params( 2. At the beginning of each prefill stage inference, recalculations are needed based on the input, but only once. """ - pointer = token_lora_tensor.data_ptr() - if pointer not in _PARAMS_CACHE: - lora_indices_tensor, seq_length_tensor = torch.unique_consecutive( - token_lora_tensor, return_counts=True) - cum_result = torch.cumsum(seq_length_tensor, dim=0) - b_seq_start_tensor = torch.zeros_like(seq_length_tensor) - b_seq_start_tensor[1:].copy_(cum_result[:-1]) - max_length = seq_length_tensor.max().item() - batch_size = lora_indices_tensor.size(0) - _PARAMS_CACHE[pointer] = ( - b_seq_start_tensor, - seq_length_tensor, - lora_indices_tensor, - batch_size, - max_length, - ) - return _PARAMS_CACHE[pointer] - - -def reset_params_cache(): - """At the beginning of the prefill stage, we need clear the - cache explicitly - """ - _PARAMS_CACHE.clear() - torch.cuda.empty_cache() + + lora_indices_tensor, seq_length_tensor = torch.unique_consecutive( + token_lora_tensor, return_counts=True) + cum_result = torch.cumsum(seq_length_tensor, dim=0) + b_seq_start_tensor = torch.zeros_like(seq_length_tensor) + b_seq_start_tensor[1:].copy_(cum_result[:-1]) + max_length = seq_length_tensor.max().item() + batch_size = lora_indices_tensor.size(0) + return ( + b_seq_start_tensor, + seq_length_tensor, + lora_indices_tensor, + batch_size, + max_length, + ) -def _get_prefill_params(token_lora_tensor: torch.Tensor, - cache_clear: bool = False): - if cache_clear: - reset_params_cache() - return _compute_params(token_lora_tensor) +class PrefillHelper: + """PrefillHelper is designed to manage and provide metadata for the sgmv + kernel during prefill stage, utilizing the singleton pattern to guarantee + the existence of only one instance of the class. + """ + _instance: Optional["PrefillHelper"] = None + initialized: bool + + def __new__(cls, *args, **kwargs): + if cls._instance is None: + cls._instance = super().__new__(cls) + cls._instance.initialized = False + return cls._instance + + def __init__(self, max_batches: int = 256, device: str = "cuda"): + """ + Args: + max_batches (int, optional): the maximum batch to pre-allocate. + Defaults to 256. 
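    The `__new__`-based singleton used by PrefillHelper can be exercised on its
    own; a stripped-down sketch of the same pattern (class name is illustrative):

        class _SharedState:
            _instance = None

            def __new__(cls, *args, **kwargs):
                if cls._instance is None:
                    cls._instance = super().__new__(cls)
                    cls._instance.initialized = False
                return cls._instance

        a = _SharedState()
        b = _SharedState()
        assert a is b    # repeated construction returns one shared metadata holder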
+ device (str, optional): Defaults to "cuda". + """ + if not self.initialized: + self.initialized = True + # these attributes are the information required for sgmv kernel + self.b_seq_start_tensor = torch.zeros(max_batches, + dtype=torch.long, + device=device) + self.seq_length_tensor = torch.empty(max_batches, + dtype=torch.long, + device=device) + self.lora_indices_tensor = torch.empty(max_batches, + dtype=torch.long, + device=device) + self.max_length: int = 0 + self.batch_size: int = -1 + + def _update_metada(self, token_lora_tensor: torch.Tensor) -> None: + + (b_seq_start_tensor, seq_length_tensor, lora_indices_tensor, + batch_size, max_length) = _compute_meta(token_lora_tensor) + + self.b_seq_start_tensor[:b_seq_start_tensor.shape[0]].copy_( + b_seq_start_tensor) + self.seq_length_tensor[:seq_length_tensor.shape[0]].copy_( + seq_length_tensor) + self.lora_indices_tensor[:lora_indices_tensor.shape[0]].copy_( + lora_indices_tensor) + self.batch_size = batch_size + self.max_length = max_length + + def get_metadata( + self, + token_lora_tensor: torch.Tensor, + need_update: bool = False + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, int, int, ]: + + #Need to recalculate and fill metadata. + if need_update: + self._update_metada(token_lora_tensor) + + return (self.b_seq_start_tensor[:self.batch_size], + self.seq_length_tensor[:self.batch_size], + self.lora_indices_tensor[:self.batch_size], self.batch_size, + self.max_length) + + +def get_prefill_meta(token_lora_tensor: torch.Tensor, + need_update: bool = False): + prefill_helper = PrefillHelper(max_batches=256, + device=str(token_lora_tensor.device)) + return prefill_helper.get_metadata(token_lora_tensor, need_update) def shrink_prefill( @@ -66,7 +125,7 @@ def shrink_prefill( lora_indices_tensor: torch.Tensor, layer_idx: int, scale: float, - cache_clear: bool = False, + need_update: bool = False, ): ( b_seq_start_tensor, @@ -74,7 +133,7 @@ def shrink_prefill( last_lora_indices_tensor, batch_size, max_length, - ) = _get_prefill_params(lora_indices_tensor, cache_clear) + ) = get_prefill_meta(lora_indices_tensor, need_update) sgmv_shrink( x, w_t_all, @@ -106,7 +165,7 @@ def expand_prefill( lora_indices_tensor: torch.Tensor, layer_idx: int, add_input: bool, - cache_clear: bool = False, + need_update: bool = False, ): ( b_seq_start_tensor, @@ -114,7 +173,7 @@ def expand_prefill( last_lora_indices_tensor, batch_size, max_length, - ) = _get_prefill_params(lora_indices_tensor, cache_clear) + ) = get_prefill_meta(lora_indices_tensor, need_update) sgmv_expand(x, w_t_all, y, b_seq_start_tensor, seq_length_tensor, last_lora_indices_tensor, batch_size, max_length, add_input) @@ -139,7 +198,7 @@ def expand_slice_prefill( y_offset: Optional[int], y_slice_size: Optional[int], add_input: bool, - cache_clear: bool = False, + need_update: bool = False, ): ( b_seq_start_tensor, @@ -147,7 +206,7 @@ def expand_slice_prefill( last_lora_indices_tensor, batch_size, max_length, - ) = _get_prefill_params(lora_indices_tensor, cache_clear) + ) = get_prefill_meta(lora_indices_tensor, need_update) sgmv_expand_slice(x, w_t_all, y, b_seq_start_tensor, seq_length_tensor, last_lora_indices_tensor, batch_size, max_length, y_offset, y_slice_size, add_input) @@ -170,7 +229,7 @@ def add_shrink( layer_idx: int, scale: float, is_prefill: bool, - cache_clear: bool = False, + need_update: bool = False, ): """ Perform the ` y+=x@w_t_all` computation, which is suitable for the @@ -182,7 +241,7 @@ def add_shrink( """ if is_prefill: shrink_prefill(y, x, w_t_all, 
lora_indices_tensor, layer_idx, scale, - cache_clear) + need_update) else: shrink_decode(y, x, w_t_all, lora_indices_tensor, layer_idx, scale) @@ -195,7 +254,7 @@ def add_expand( layer_idx: int, is_prefill: bool, add_input: bool = True, - cache_clear: bool = False, + need_update: bool = False, ): """ Perform the ` y+=x@w_t_all` computation, which is suitable for the @@ -207,7 +266,7 @@ def add_expand( """ if is_prefill: expand_prefill(y, x, w_t_all, lora_indices_tensor, layer_idx, - add_input, cache_clear) + add_input, need_update) else: expand_decode(y, x, w_t_all, lora_indices_tensor, layer_idx, add_input) @@ -222,14 +281,14 @@ def add_expand_slice( y_offset: Optional[int], y_slice_size: Optional[int], add_input: bool = True, - cache_clear: bool = False, + need_update: bool = False, ): """ Similar to `add_expand` """ if is_prefill: expand_slice_prefill(y, x, w_t_all, lora_indices_tensor, layer_idx, - y_offset, y_slice_size, add_input, cache_clear) + y_offset, y_slice_size, add_input, need_update) else: expand_slice_decode(y, x, w_t_all, lora_indices_tensor, layer_idx, y_offset, y_slice_size, add_input) @@ -248,7 +307,7 @@ def add_lora( y_slice_size: Optional[int] = None, *, buffer: Optional[torch.Tensor] = None, - cache_clear: bool = False, + need_update: bool = False, ): """ Semantics: @@ -271,7 +330,8 @@ def add_lora( column of y. y_slice_size (Optional[int], optional): Size of the y column slice.. buffer (Optional[torch.Tensor], optional): Defaults to None. - cache_clear (bool, optional): Defaults to False. + need_update (bool, optional): Indicates whether updating sgmv metadata + is needed. Defaults to False. """ r = wb_t_all.size(-1) @@ -290,7 +350,7 @@ def add_lora( 0, scale, is_prefill, - cache_clear=cache_clear, + need_update=need_update, ) if y_offset is None and y_slice_size is None: add_expand(y, @@ -300,7 +360,7 @@ def add_lora( 0, is_prefill, add_input=True, - cache_clear=cache_clear) + need_update=need_update) else: add_expand_slice(y, buffer, @@ -311,4 +371,4 @@ def add_lora( y_offset, y_slice_size, add_input=True, - cache_clear=cache_clear) + need_update=need_update) From 46486972eb2fac8da50eaa4c26d01dfdf7924c35 Mon Sep 17 00:00:00 2001 From: jeejeeli Date: Tue, 9 Jul 2024 16:02:49 +0800 Subject: [PATCH 47/71] fix typo --- vllm/lora/models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/lora/models.py b/vllm/lora/models.py index e18fdaa00e9a..cf26fe66ff41 100644 --- a/vllm/lora/models.py +++ b/vllm/lora/models.py @@ -26,7 +26,7 @@ _GLOBAL_LORA_ID = 0 -# NOTE This value comes fromllm/worker/model_runner.py +# NOTE This value comes from vllm/worker/model_runner.py _MAX_BATCH_SIZE = 256 From 8732c76e30c6485e04b08edc6768b6d1dffe7eab Mon Sep 17 00:00:00 2001 From: jeejeeli Date: Wed, 10 Jul 2024 16:46:56 +0800 Subject: [PATCH 48/71] refactor punica wrapper --- vllm/lora/fully_sharded_layers.py | 95 +--- vllm/lora/layers.py | 283 ++-------- vllm/lora/models.py | 192 +------ vllm/lora/ops/sgmv_expand.py | 4 +- vllm/lora/ops/sgmv_expand_slice.py | 4 +- vllm/lora/ops/sgmv_shrink.py | 4 +- vllm/lora/punica.py | 827 ++++++++++++++++++----------- 7 files changed, 589 insertions(+), 820 deletions(-) diff --git a/vllm/lora/fully_sharded_layers.py b/vllm/lora/fully_sharded_layers.py index d21649bed5d6..cae7d593f123 100644 --- a/vllm/lora/fully_sharded_layers.py +++ b/vllm/lora/fully_sharded_layers.py @@ -14,7 +14,8 @@ MergedQKVParallelLinearWithLora, QKVParallelLinearWithLora, RowParallelLinearWithLoRA) -from vllm.lora.punica import add_expand, 
add_expand_slice, add_shrink + +# from vllm.lora.punica import add_expand, add_expand_slice, add_shrink if TYPE_CHECKING: pass @@ -64,27 +65,12 @@ def apply(self, x: torch.Tensor, dtype=torch.float32, device=x.device, ) - token_num = self.indices_len[0] - is_prefill = bool(self.indices_len[5]) - add_shrink( - buffer, - x, - self.lora_a_stacked, - self.indices[:token_num], - 0, - 1.0, - is_prefill, - ) + self.punica_wrapper.add_shrink(buffer, x, self.lora_a_stacked, 1.0) buffer = tensor_model_parallel_all_gather(buffer) - add_expand( - output, - buffer, - self.lora_b_stacked, - self.indices[:token_num], - 0, - is_prefill, - add_input=True, - ) + self.punica_wrapper.add_expand(output, + buffer, + self.lora_b_stacked, + add_input=True) # now have column partitioned output output = output.view(*out_orig_shape) return output @@ -108,7 +94,7 @@ def can_replace_layer( ) -def _mcp_apply(x, bias, layer): +def _mcp_apply(x, bias, layer: QKVParallelLinearWithLora): """ MergedColumnParallelLinearWithShardedLoRA and QKVParallelLinearWithShardedLora share the same @@ -129,31 +115,18 @@ def _mcp_apply(x, bias, layer): dtype=torch.float32, device=x.device, ) - token_num = layer.indices_len[0] - is_prefill = bool(layer.indices_len[5]) for idx in range(n): - - add_shrink( - buffers[idx], - x, - layer.lora_a_stacked[idx], - layer.indices[:token_num], - 0, - 1.0, - is_prefill, - ) + layer.punica_wrapper.add_shrink(buffers[idx], x, + layer.lora_a_stacked[idx], 1.0) buffers = tensor_model_parallel_all_gather(buffers) left_offset = 0 for idx in range(n): shard_size = layer.lora_b_stacked[idx].shape[2] - add_expand_slice( + layer.punica_wrapper.add_expand_slice( output, buffers[idx], layer.lora_b_stacked[idx], - layer.indices[:token_num], - 0, - is_prefill, left_offset, shard_size, add_input=True, @@ -237,23 +210,13 @@ def apply(self, x: torch.Tensor, buffer = torch.zeros((x.shape[0], self.lora_a_stacked.shape[2]), dtype=torch.float32, device=x.device) - - token_num = self.indices_len[0] - is_prefill = bool(self.indices_len[5]) - - add_shrink(buffer, x, self.lora_a_stacked, self.indices[:token_num], 0, - 1.0, is_prefill) + self.punica_wrapper.add_shrink(buffer, x, self.lora_a_stacked, 1.0) buffer = tensor_model_parallel_all_gather(buffer) - - add_expand(output, - buffer, - self.lora_b_stacked, - self.indices[:token_num], - 0, - is_prefill, - add_input=True) + self.punica_wrapper.add_expand(output, + buffer, + self.lora_b_stacked, + add_input=True) # now have column partitioned output - output = output.view(*out_orig_shape) return output @@ -345,17 +308,8 @@ def apply(self, x: torch.Tensor) -> torch.Tensor: dtype=torch.float32, device=x.device, ) - token_num = self.indices_len[0] - is_prefill = bool(self.indices_len[5]) - add_shrink( - buffer, - x, - self.lora_a_stacked, - self.indices[:token_num], - 0, - 1.0, - is_prefill, - ) + + self.punica_wrapper.add_shrink(buffer, x, self.lora_a_stacked, 1.0) buffer = tensor_model_parallel_all_reduce(buffer) # following S-LoRA, allows the fusing of all_gather and all_reduce @@ -366,16 +320,9 @@ def apply(self, x: torch.Tensor) -> torch.Tensor: # reduced before being used shard_size = self.lora_b_stacked.shape[2] start_idx = self.tp_rank * shard_size - add_expand_slice( - output, - buffer, - self.lora_b_stacked, - self.indices[:self.indices_len[0]], - 0, - is_prefill, - start_idx, - shard_size, - ) + self.punica_wrapper.add_expand_slice(output, buffer, + self.lora_b_stacked, start_idx, + shard_size) output = output.view(*out_orig_shape) return output diff --git 
a/vllm/lora/layers.py b/vllm/lora/layers.py index 57b0eb2347d8..16d086f2e8a1 100644 --- a/vllm/lora/layers.py +++ b/vllm/lora/layers.py @@ -16,8 +16,7 @@ tensor_model_parallel_all_reduce, tensor_model_parallel_gather) from vllm.distributed.utils import divide -# from vllm.lora.ops.sgmv_expand import sgmv_expand -from vllm.lora.punica import add_expand, add_lora +from vllm.lora.punica import PunicaWrapper from vllm.model_executor.layers.linear import (ColumnParallelLinear, MergedColumnParallelLinear, QKVParallelLinear, @@ -63,92 +62,6 @@ def dec(*args, **kwargs): return dec -def _apply_lora(x: torch.Tensor, - lora_a_stacked: torch.Tensor, - lora_b_stacked: torch.Tensor, - lora_index_tensor: torch.Tensor, - indices_info: List[int], - output: torch.Tensor, - need_update: bool = False) -> torch.Tensor: - """Applies lora to each input. This method applies all loras to each - input. It uses the `lora_index_tensor` vector to determine which lora - yields the correct output. An index of -1 means no lora should be - applied. This method adds the final lora results to the output. - - Args: - x (torch.Tensor): (batch_size, hidden_dim) - lora_a_stacked (torch.Tensor): (num_loras, lora_rank, hidden_dim) - lora_b_stacked (torch.Tensor): (num_loras, output_dim, lora_rank) - lora_index_tensor (torch.Tensor): (batch_size*seq_number,). The LoRA - index corresponding to each token - indices_len(List):(6,), It contains (base_indices, sampler_indices, - sampler_indices_padded,embeddings_indices, long_lora_indices, - prefill flag). - output (torch.Tensor): (batch_size, output_dim) - need_update (bool, optional): Indicates whether updating sgmv metadata - is needed. Defaults to False. - - Returns: - output (torch.Tensor): (batch_size*seq_number, output_dim) - - """ - org_output = output - x = x.view(-1, x.shape[-1]) - output = output.view(-1, output.shape[-1]) - token_num = indices_info[0] - is_prefill = bool(indices_info[5]) - add_lora(output, - x, - lora_a_stacked, - lora_b_stacked, - lora_index_tensor[:token_num], - 0, - 1.0, - is_prefill, - need_update=need_update) - return output.view_as(org_output) - - -def _apply_lora_packed_nslice(x: torch.Tensor, - lora_a_stacked: Tuple[torch.Tensor, torch.Tensor, - torch.Tensor], - lora_b_stacked: Tuple[torch.Tensor, torch.Tensor, - torch.Tensor], - lora_index_tensor: torch.Tensor, - indices_info: List[int], - output: torch.Tensor, - output_slices: Tuple[int, ...], - need_update: bool = False) -> torch.Tensor: - """ - Applies lora to each input. Similar to _apply_lora, This method is - used for layers that are composed of multiple sublayers - (slices) packed together. 
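    The packed-slice bookkeeping that moves into `add_lora_packed_nslice` can be
    pictured with this minimal sketch (LoRA math elided, only the column offsets
    are shown; sizes are made up):

        import torch

        output = torch.zeros(4, 12)                  # fused output: slice widths 5 + 7
        slice_outputs = [torch.ones(4, 5), 2 * torch.ones(4, 7)]
        output_slices = (5, 7)

        offset_left = 0
        for s, width in zip(slice_outputs, output_slices):
            # each slice has its own lora_a/lora_b pair; its result lands at
            # its own column offset inside the fused output tensor
            output[:, offset_left:offset_left + width] += s
            offset_left += width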
- """ - org_output = output - x = x.view(-1, x.shape[-1]) - output = output.view(-1, output.shape[-1]) - - token_num = indices_info[0] - is_prefill = bool(indices_info[5]) - offset_left = 0 - # TODO fuse these kernels - for slice_idx in range(len(output_slices)): - add_lora(output, - x, - lora_a_stacked[slice_idx], - lora_b_stacked[slice_idx], - lora_index_tensor[:token_num], - 0, - 1.0, - is_prefill, - offset_left, - output_slices[slice_idx], - need_update=need_update) - offset_left += output_slices[slice_idx] - - return output.view_as(org_output) - - @dataclass class LoRAMapping: # Per every token in input_ids: @@ -202,15 +115,9 @@ def set_lora( def set_mapping( self, - base_indices: torch.Tensor, - sampler_indices: torch.Tensor, - sampler_indices_padded: torch.Tensor, - embeddings_indices: torch.Tensor, - long_lora_indices: torch.Tensor, - indices_len: List[int], + punica_wrapper: PunicaWrapper, ): - """Sets the mapping indices.""" - ... + self.punica_wrapper: PunicaWrapper = punica_wrapper @classmethod def can_replace_layer( @@ -288,10 +195,6 @@ def create_lora_weights( self.lora_a_stacked.shape[0] * self.lora_a_stacked.shape[1], self.lora_a_stacked.shape[2], ) - # Lazily initialized. - self.indices: torch.Tensor - self.indices_len: List[int] - self.embeddings_indices: torch.Tensor def reset_lora(self, index: int): self.lora_a_stacked[index] = 0 @@ -326,28 +229,15 @@ def set_lora( assert self.embeddings_weights is not None self.embeddings_weights[:embeddings.shape[0]].copy_(embeddings) - def set_mapping( - self, - base_indices: torch.Tensor, - sampler_indices: torch.Tensor, - sampler_indices_padded: torch.Tensor, - embeddings_indices: torch.Tensor, - long_lora_indices: torch.Tensor, - indices_len: List[int], - ): - self.indices = base_indices - self.embeddings_indices = embeddings_indices - self.indices_len = indices_len - def forward(self, x: torch.Tensor) -> torch.Tensor: added_tokens_mask = x > self.base_layer.org_vocab_size - 1 - embedding_len = self.indices_len[3] - indices = self.embeddings_indices[1][:embedding_len].view_as(x) + embeddings_indices = self.punica_wrapper.embeddings_indices + indices = embeddings_indices[1].view_as(x) full_lora_a_embeddings = F.embedding( x + indices, self.lora_a_stacked_2d, ) - indices = self.embeddings_indices[0][:embedding_len].view_as(x) + indices = embeddings_indices[0].view_as(x) full_output = self.base_layer.forward( x.add_(indices * added_tokens_mask)) @@ -361,20 +251,12 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: full_lora_a_embeddings.shape[1], -1, ) - # full_lora_a_embeddings = full_lora_a_embeddings.view( - # -1, full_lora_a_embeddings.shape[-1]) - # full_output = full_output.view(-1, full_output.shape[-1]) - token_num = self.indices_len[0] - is_prefill = bool(self.indices_len[5]) - add_expand( - full_output, - full_lora_a_embeddings, - self.lora_b_stacked, - self.indices[:token_num], - 0, - is_prefill, - add_input=True, - ) + + # Embedding layer only need expand op + self.punica_wrapper.add_expand(full_output, + full_lora_a_embeddings, + self.lora_b_stacked, + add_input=True) return full_output.view_as(full_output_org) @classmethod @@ -432,10 +314,6 @@ def create_lora_weights( ) self.output_dim = self.lora_b_stacked.shape[2] - # lazily initialized. 
- self.indices: torch.Tensor - self.indices_len: List[int] - def reset_lora(self, index: int): self.lora_a_stacked[index] = 0 self.lora_b_stacked[index] = 0 @@ -471,29 +349,11 @@ def set_lora( 0, :lora_b.shape[1], :lora_b.shape[0]].copy_( lora_b.T, non_blocking=True) - def set_mapping( - self, - base_indices: torch.Tensor, - sampler_indices: torch.Tensor, - sampler_indices_padded: torch.Tensor, - embeddings_indices: torch.Tensor, - long_lora_indices: torch.Tensor, - indices_len: List[int], - ): - self.indices = base_indices - self.indices_len = indices_len - def apply(self, x: torch.Tensor, bias: Optional[torch.Tensor]) -> torch.Tensor: output = self.base_layer.quant_method.apply(self.base_layer, x, bias) - _apply_lora( - x, - self.lora_a_stacked, - self.lora_b_stacked, - self.indices, - self.indices_len, - output, - ) + self.punica_wrapper.add_lora(output, x, self.lora_a_stacked, + self.lora_b_stacked, 1.0) return output def forward(self, input_): @@ -587,9 +447,6 @@ def create_lora_weights( ) for _ in range(n_slices)) self.output_dim = self.lora_b_stacked[0].shape[2] - # Lazily initialized. - self.indices: torch.Tensor - self.indices_len: torch.Tensor def reset_lora(self, index: int): self.lora_a_stacked[0][index] = 0 @@ -647,15 +504,9 @@ def set_lora( def apply(self, x: torch.Tensor, bias: Optional[torch.Tensor]) -> torch.Tensor: output = self.base_layer.quant_method.apply(self.base_layer, x, bias) - _apply_lora_packed_nslice( - x, - self.lora_a_stacked, - self.lora_b_stacked, - self.indices, - self.indices_len, - output, - (self.output_dim, self.output_dim), - ) + self.punica_wrapper.add_lora_packed_nslice( + output, x, self.lora_a_stacked, self.lora_b_stacked, 1.0, + (self.output_dim, self.output_dim)) return output @classmethod @@ -917,16 +768,10 @@ def set_lora( def apply(self, x: torch.Tensor, bias: Optional[torch.Tensor]) -> torch.Tensor: output = self.base_layer.quant_method.apply(self.base_layer, x, bias) - _apply_lora_packed_nslice( - x, - self.lora_a_stacked, - self.lora_b_stacked, - self.indices, - self.indices_len, - output, - self.output_slices, - ) - + self.punica_wrapper.add_lora_packed_nslice(output, x, + self.lora_a_stacked, + self.lora_b_stacked, 1.0, + self.output_slices) return output @classmethod @@ -984,9 +829,6 @@ def create_lora_weights( dtype=lora_config.lora_dtype, device=self.device, ) - # Lazily initialized - self.indices: torch.Tensor - self.indices_len: List[int] def reset_lora(self, index: int): self.lora_a_stacked[index] = 0 @@ -1023,29 +865,10 @@ def set_lora( 0, :lora_b.shape[1], :lora_b.shape[0]].copy_( lora_b.T, non_blocking=True) - def set_mapping( - self, - base_indices: torch.Tensor, - sampler_indices: torch.Tensor, - sampler_indices_padded: torch.Tensor, - embeddings_indices: torch.Tensor, - long_lora_indices: torch.Tensor, - indices_len: List[int], - ): - self.indices = base_indices - self.indices_len = indices_len - def apply(self, x: torch.Tensor) -> torch.Tensor: output = self.base_layer.quant_method.apply(self.base_layer, x) - # maybe we need not restrict range to [:batch_size] - _apply_lora( - x, - self.lora_a_stacked, - self.lora_b_stacked, - self.indices, - self.indices_len, - output, - ) + self.punica_wrapper.add_lora(output, x, self.lora_a_stacked, + self.lora_b_stacked, 1.0) return output def forward(self, input_): @@ -1200,10 +1023,6 @@ def create_lora_weights( dtype=torch.long) else: self.sharded_to_full_mapping_gpu = None - # Lazily initialized. 
- self.indices: torch.Tensor - self.indices_len: List[int] - self.indices_padded: torch.Tensor def reset_lora(self, index: int): self.lora_a_stacked[index] = 0 @@ -1229,19 +1048,6 @@ def set_lora( index, :embeddings_tensor.shape[0], :embeddings_tensor. shape[1], ] = embeddings_tensor - def set_mapping( - self, - base_indices: torch.Tensor, - sampler_indices: torch.Tensor, - sampler_indices_padded: torch.Tensor, - embeddings_indices: torch.Tensor, - long_lora_indices: torch.Tensor, - indices_len: List[int], - ): - self.indices = sampler_indices - self.indices_padded = sampler_indices_padded - self.indices_len = indices_len - def _get_logits( self, hidden_states: torch.Tensor, @@ -1287,34 +1093,24 @@ def _get_logits( out=lora_logits[:-1]) lora_logits[-1] = float("-inf") lora_logits = lora_logits.mT + indices_padded = self.punica_wrapper.sampler_indices_padded lora_logits = (lora_logits.reshape( lora_logits.shape[0] * lora_logits.shape[1], lora_logits.shape[2], - ).index_select(0, - self.indices_padded[:self.indices_len[2]]).nan_to_num_( - nan=float("-inf"), - posinf=float("inf"), - neginf=float("-inf"))) + ).index_select(0, indices_padded).nan_to_num_(nan=float("-inf"), + posinf=float("inf"), + neginf=float("-inf"))) logits[:, self.base_layer.org_vocab_size:self.base_layer.org_vocab_size + lora_logits.shape[1], ] = lora_logits - sampler_indices = self.indices_len[1] # LogitsProcessorWithLoRA always using bgmv - is_prefill = False - add_lora( - logits, - hidden_states, - self.lora_a_stacked, - self.lora_b_stacked, - self.indices[:sampler_indices], - 0, - 1.0, - is_prefill, - ) + self.punica_wrapper.add_lora_logits(logits, hidden_states, + self.lora_a_stacked, + self.lora_b_stacked, 1.0) + # Remove paddings in vocab (if any). logits = logits[:, :self.base_layer.vocab_size] - return logits def forward(self, *args, **kwargs): @@ -1343,9 +1139,6 @@ class LinearScalingRotaryEmbeddingWithLora(BaseLayerWithLoRA): def __init__(self, base_layer: RotaryEmbedding) -> None: super().__init__() self.base_layer = base_layer - # Lazily initialized - self.long_lora_indices: torch.Tensor - self.indices_len: List[int] @property def scaling_factors(self): @@ -1389,18 +1182,6 @@ def set_lora( ): ... 
- def set_mapping( - self, - base_indices: torch.Tensor, - sampler_indices: torch.Tensor, - sampler_indices_padded: torch.Tensor, - embeddings_indices: torch.Tensor, - long_lora_indices: torch.Tensor, - indices_len: List[int], - ): - self.long_lora_indices = long_lora_indices - self.indices_len = indices_len - def forward( self, positions: torch.Tensor, @@ -1411,7 +1192,7 @@ def forward( positions, query, key, - offsets=self.long_lora_indices[:self.indices_len[4]], + offsets=self.punica_wrapper.long_lora_indices, ) @property diff --git a/vllm/lora/models.py b/vllm/lora/models.py index cf26fe66ff41..d743f1c52bf7 100644 --- a/vllm/lora/models.py +++ b/vllm/lora/models.py @@ -4,7 +4,7 @@ import os import re from dataclasses import dataclass, field -from typing import Callable, Dict, List, Optional, Tuple, Type, Union +from typing import Callable, Dict, List, Optional, Type import safetensors.torch import torch @@ -16,7 +16,7 @@ LinearScalingRotaryEmbeddingWithLora, LoRAMapping) from vllm.lora.lora import LoRALayerWeights, PackedLoRALayerWeights -from vllm.lora.punica import PrefillHelper +from vllm.lora.punica import PunicaWrapper from vllm.lora.utils import (from_layer, from_layer_logits_processor, parse_fine_tuned_lora_name, replace_submodule) from vllm.model_executor.models.interfaces import SupportsLoRA @@ -43,128 +43,6 @@ class LongContextLoRAContext: offsets_by_lora_id: Dict[int, int] = field(default_factory=dict) -def convert_mapping( - mapping: LoRAMapping, - lora_index_to_id: List[Optional[int]], - max_loras: int, - vocab_size: int, - extra_vocab_size: int, - long_lora_context: Optional[LongContextLoRAContext] = None, -) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, - Optional[torch.Tensor], List[int], ]: - """Converts LoRAMapping to index tensors. - - Args: - mapping: LoRAMapping mapping rows in a batch to LoRA ids. - lora_index_to_id: List mapping LoRA ids to LoRA indices. - max_loras: Maximum number of LoRAs. - vocab_size: Model vocab size. - extra_vocab_size: Extra vocab size each LoRA can have. - long_lora_context: Passed if there are long context lora in a batch. - - Returns: - A tuple of tensors: - base_indices: Tensor of shape [batch_size] mapping batch rows to - LoRA indices. - sampler_indices: Tensor of shape [batch_size] mapping requests to - LoRA indices for sampler. For generation, this will be the - same as base_indicies. For prefill, this will map requests - to LoRA indices. - sampler_indices_padded: Tensor of shape [batch_size] mapping - requests to LoRA indices for sampler with padding. - Same as sampler_indicies, but -1 is replaced with - max_loras. - embeddings_indices: Tensor of shape [2, batch_size] mapping - requests to embedding indices. First row is for embeddings - added by the LoRAs, second row is for the LoRA.lora_a - embeddings. - long_lora_indices: Tensor of shape [batch_size] mapping - requests to RoPE offsets and rot dims for long LoRAs. - None if long context lora doesn't exist. - indices_len: List of lengths of the above tensors and prefill - flag. Used to index into each tensor. It contains - (base_indices, sampler_indices, sampler_indices_padded, - embeddings_indices, long_lora_indices, prefill flag). 
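    A small worked example of the id-to-slot translation this function performed
    (values are illustrative): per-token LoRA ids from the mapping become slot
    indices, with id 0 mapping to -1 ("no LoRA"):

        lora_index_to_id = [3, 7, None, None]        # slot -> active LoRA id
        index_mapping = [7, 7, 3, 0]                 # per-token LoRA ids, 0 = none
        base_indices = [
            lora_index_to_id.index(i) if i > 0 else -1 for i in index_mapping
        ]
        # base_indices -> [1, 1, 0, -1]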
- """ - index_mapping_indices: List[int] = list(mapping.index_mapping).copy() - embedding_indices = index_mapping_indices.copy() - lora_indices = index_mapping_indices.copy() - long_lora_offsets: Optional[torch.Tensor] = None - if long_lora_context: - long_lora_offsets = torch.zeros(len(index_mapping_indices), - device="cuda", - dtype=torch.long) - prompt_mapping: List[int] = [ - lora_index_to_id.index(x) if x > 0 else -1 - for x in mapping.prompt_mapping - ] - lora_idx = None - for i in range(len(index_mapping_indices)): - # TODO index can be slow. optimize - lora_idx = (lora_index_to_id.index(index_mapping_indices[i]) - if index_mapping_indices[i] > 0 else -1) - embedding_indices[i] = lora_idx if index_mapping_indices[i] > 0 else 0 - lora_indices[i] = lora_idx - if long_lora_context: - assert long_lora_offsets is not None - lora_offset: int = long_lora_context.offsets_by_lora_id.get( - index_mapping_indices[i], 0) - long_lora_offsets[i] = lora_offset - - indices_list: List[Union[List[int], torch.Tensor]] = [ - index_mapping_indices, - lora_indices, - embedding_indices, - ] - if long_lora_context: - assert long_lora_offsets is not None - indices_list.append(long_lora_offsets) - indices = torch.tensor(indices_list, dtype=torch.long, device="cuda") - prompt_mapping_tensor = torch.tensor(prompt_mapping, - device="cuda", - dtype=torch.long) - embeddings_indices = torch.stack([ - indices[2] * extra_vocab_size, - indices[2] * (vocab_size + extra_vocab_size), - ]) - embeddings_indices[embeddings_indices == -1] = max_loras - 1 - base_indices = indices[1] - sampler_indices = prompt_mapping_tensor - sampler_indices_padded = sampler_indices.clone() - sampler_indices_padded[sampler_indices_padded == -1] = max_loras - 1 - sampler_indices_padded = torch.arange( - 0, len(sampler_indices_padded), device="cuda", dtype=torch.long) + ( - sampler_indices_padded * len(sampler_indices_padded)) - long_lora_indices = None - long_lora_indices_len: Optional[int] = None - if long_lora_context: - long_lora_indices = indices[3] - long_lora_indices_len = long_lora_indices.shape[-1] - # Contain length of indices tensors. Used to index into each tensor. 
- indices_len = [ - base_indices.shape[-1], - sampler_indices.shape[-1], - sampler_indices_padded.shape[-1], - embeddings_indices.shape[-1], - ] - if long_lora_indices_len is not None: - indices_len.append(long_lora_indices_len) - else: - #If long_lora doesn't exist,append None - indices_len.append(None) - # Append a prefill flag to help selecting the appropriate lora - # ops (sgmv or bgmv) - indices_len.append(int(mapping.is_prefill)) - return ( - base_indices, - sampler_indices, - sampler_indices_padded, - embeddings_indices, - long_lora_indices, - indices_len, - ) - - def get_lora_id(): global _GLOBAL_LORA_ID _GLOBAL_LORA_ID += 1 @@ -440,32 +318,11 @@ def __init__( self.lora_index_to_id: List[Optional[int]] = [None] * self.lora_slots self.vocab_size = vocab_size self.long_lora_context: Optional[LongContextLoRAContext] = None - self.base_indices = torch.empty(self.max_num_batched_tokens, - dtype=torch.long, - device="cuda") - self.sampler_indices = torch.empty(self.max_num_batched_tokens, - dtype=torch.long, - device="cuda") - self.sampler_indices_padded = torch.empty(self.max_num_batched_tokens, - dtype=torch.long, - device="cuda") - self.embeddings_indices = torch.empty(2, - self.max_num_batched_tokens, - dtype=torch.long, - device="cuda") - self.long_lora_indices = torch.empty(self.max_num_batched_tokens, - dtype=torch.long, - device="cuda") - # Scaling factor -> offset to the sin_cos_cache to it. - # Used for long context lora. self.scaling_factor_to_offset: Dict[float, int] = {} - # 6 is the number of indicies tensors. - # base_indices, sampler_indices, sampler_indices_padded, - # embeddings_indices,long_lora_indices,prefill or decode stage - self.indices_len: List[Optional[int]] = [None] * 6 - self.prefill_helper = PrefillHelper(max_batches=_MAX_BATCH_SIZE, - device=str( - self.base_indices.device)) + # maintain the state information for lora + self.punica_wrapper = PunicaWrapper(max_num_batched_tokens, + max_batches=_MAX_BATCH_SIZE, + device="cuda") self.model = model if hasattr(self.model, "supported_lora_modules"): @@ -596,16 +453,9 @@ def pin_lora(self, lora_id: int) -> bool: "Pinning is not supported in LoRAModelManager." "Use LRUCacheLoRAModelManager for pinning") # type: ignore - # TODO see if this can be vectorized def _set_lora_mapping(self, mapping: LoRAMapping) -> None: - ( - base_indices, - sampler_indices, - sampler_indices_padded, - embeddings_indices, - long_lora_offsets_tensor, - indices_len, - ) = convert_mapping( + # update lora states + self.punica_wrapper.update_metadata( mapping, self.lora_index_to_id, self.lora_slots + 1, @@ -613,23 +463,6 @@ def _set_lora_mapping(self, mapping: LoRAMapping) -> None: self.lora_config.lora_extra_vocab_size, self.long_lora_context, ) - self.base_indices[:base_indices.shape[0]].copy_(base_indices) - self.sampler_indices[:sampler_indices.shape[0]].copy_(sampler_indices) - self.sampler_indices_padded[:sampler_indices_padded.shape[0]].copy_( - sampler_indices_padded) - self.embeddings_indices[:embeddings_indices. 
- shape[0], :embeddings_indices.shape[1]].copy_( - embeddings_indices) - if long_lora_offsets_tensor is not None: - self.long_lora_indices[:long_lora_offsets_tensor.shape[0]].copy_( - long_lora_offsets_tensor) - else: - self.long_lora_indices.zero_() - # Maintain the reference - self.indices_len[:] = indices_len - if mapping.is_prefill: - self.prefill_helper.get_metadata( - self.base_indices[:base_indices.shape[0]], need_update=True) def set_lora_mapping(self, lora_mapping: LoRAMapping) -> None: if self._last_mapping != lora_mapping: @@ -691,14 +524,7 @@ def _create_lora_modules(self): ) self.register_module(module_name, new_module) self._register_packed_modules(module_name) - new_module.set_mapping( - self.base_indices, - self.sampler_indices, - self.sampler_indices_padded, - self.embeddings_indices, - self.long_lora_indices, - self.indices_len, - ) + new_module.set_mapping(self.punica_wrapper) def register_module(self, module_name: str, module: "BaseLayerWithLoRA"): assert isinstance(module, BaseLayerWithLoRA) diff --git a/vllm/lora/ops/sgmv_expand.py b/vllm/lora/ops/sgmv_expand.py index f3a53b70f415..2873882bc263 100644 --- a/vllm/lora/ops/sgmv_expand.py +++ b/vllm/lora/ops/sgmv_expand.py @@ -159,10 +159,10 @@ def sgmv_expand( torch.bfloat16, ]: CAST_TYPE = True - grid = [ + grid = ( triton.cdiv(max_seq_length, BLOCK_M) * triton.cdiv(N, BLOCK_N), batches, - ] + ) _sgmv_expand_kernel[grid]( inputs, lora_b_weights, diff --git a/vllm/lora/ops/sgmv_expand_slice.py b/vllm/lora/ops/sgmv_expand_slice.py index 52c71c5095b5..2078a47d7e8e 100644 --- a/vllm/lora/ops/sgmv_expand_slice.py +++ b/vllm/lora/ops/sgmv_expand_slice.py @@ -171,10 +171,10 @@ def sgmv_expand_slice( torch.bfloat16, ]: CAST_TYPE = True - grid = [ + grid = ( triton.cdiv(max_seq_length, BLOCK_M) * triton.cdiv(N, BLOCK_N), batches, - ] + ) _sgmv_expand_slice_kernel[grid]( inputs, lora_b_weights, diff --git a/vllm/lora/ops/sgmv_shrink.py b/vllm/lora/ops/sgmv_shrink.py index f295f0118f0b..094bc62d9da4 100644 --- a/vllm/lora/ops/sgmv_shrink.py +++ b/vllm/lora/ops/sgmv_shrink.py @@ -153,11 +153,11 @@ def sgmv_shrink( BLOCK_K = 32 SPLIT_K = 8 EVEN_K = K % (BLOCK_K * SPLIT_K) == 0 - grid = [ + grid = ( triton.cdiv(max_seq_length, BLOCK_M) * triton.cdiv(N, BLOCK_N), SPLIT_K, batches, - ] + ) _sgmv_shrink_kernel[grid]( inputs, diff --git a/vllm/lora/punica.py b/vllm/lora/punica.py index 5347a68852a0..da51105fd907 100644 --- a/vllm/lora/punica.py +++ b/vllm/lora/punica.py @@ -3,9 +3,9 @@ Chen, L., Ye, Z., Wu, Y., Zhuo, D., Ceze, L., & Krishnamurthy, A. (2023). Punica: Multi-Tenant LoRA Serving. https://arxiv.org/abs/2310.18547 -""" - -from typing import Optional, Tuple +# """ +# from dataclasses import dataclass, field +from typing import TYPE_CHECKING, Callable, List, Optional, Tuple, Union import torch @@ -16,17 +16,23 @@ from vllm.lora.ops.sgmv_expand_slice import sgmv_expand_slice from vllm.lora.ops.sgmv_shrink import sgmv_shrink +if TYPE_CHECKING: + # avodi circuit import + from vllm.lora.layers import LoRAMapping + from vllm.lora.models import LongContextLoRAContext + +@torch.compile def _compute_meta( token_lora_tensor: torch.Tensor -) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, int, int, ]: +) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, int, int]: """ Get the information required for the sgmv kernel. With the features: - 1. If consecutive requests in the batch use the same LoRA, this function - will combine them into a single request, improving sgmv kernel inference + 1. 
If consecutive requests in the batch use the same LoRA, this function + will combine them into a single request, improving sgmv kernel inference performance. - 2. At the beginning of each prefill stage inference, recalculations are - needed based on the input, but only once. + 2. At the beginning of each prefill stage inference, recalculations are + needed based on the input, but only once. """ lora_indices_tensor, seq_length_tensor = torch.unique_consecutive( @@ -45,43 +51,229 @@ def _compute_meta( ) -class PrefillHelper: - """PrefillHelper is designed to manage and provide metadata for the sgmv - kernel during prefill stage, utilizing the singleton pattern to guarantee - the existence of only one instance of the class. +# TODO see if this can be vectorized +def convert_mapping( + mapping: "LoRAMapping", + lora_index_to_id: List[Optional[int]], + max_loras: int, + vocab_size: int, + extra_vocab_size: int, + long_lora_context: Optional["LongContextLoRAContext"] = None, +) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, + Optional[torch.Tensor], List[int]]: + """Converts LoRAMapping to index tensors. + + Args: + mapping: LoRAMapping mapping rows in a batch to LoRA ids. + lora_index_to_id: List mapping LoRA ids to LoRA indices. + max_loras: Maximum number of LoRAs. + vocab_size: Model vocab size. + extra_vocab_size: Extra vocab size each LoRA can have. + long_lora_context: Passed if there are long context lora in a batch. + + Returns: + A tuple of tensors: + base_indices: Tensor of shape [batch_size] mapping batch rows to + LoRA indices. + sampler_indices: Tensor of shape [batch_size] mapping requests to + LoRA indices for sampler. For generation, this will be the + same as base_indicies. For prefill, this will map requests + to LoRA indices. + sampler_indices_padded: Tensor of shape [batch_size] mapping + requests to LoRA indices for sampler with padding. + Same as sampler_indicies, but -1 is replaced with + max_loras. + embeddings_indices: Tensor of shape [2, batch_size] mapping + requests to embedding indices. First row is for embeddings + added by the LoRAs, second row is for the LoRA.lora_a + embeddings. + long_lora_indices: Tensor of shape [batch_size] mapping + requests to RoPE offsets and rot dims for long LoRAs. + None if long context lora doesn't exist. + indices_len: List of lengths of the above tensors. It contains + (base_indices, sampler_indices, sampler_indices_padded, + embeddings_indices, long_lora_indices). """ - _instance: Optional["PrefillHelper"] = None - initialized: bool + index_mapping_indices: List[int] = list(mapping.index_mapping).copy() + embedding_indices = index_mapping_indices.copy() + lora_indices = index_mapping_indices.copy() + long_lora_offsets: Optional[torch.Tensor] = None + if long_lora_context: + long_lora_offsets = torch.zeros(len(index_mapping_indices), + device="cuda", + dtype=torch.long) + prompt_mapping: List[int] = [ + lora_index_to_id.index(x) if x > 0 else -1 + for x in mapping.prompt_mapping + ] + lora_idx = None + for i in range(len(index_mapping_indices)): + # TODO index can be slow. 
optimize + lora_idx = (lora_index_to_id.index(index_mapping_indices[i]) + if index_mapping_indices[i] > 0 else -1) + embedding_indices[i] = lora_idx if index_mapping_indices[i] > 0 else 0 + lora_indices[i] = lora_idx + if long_lora_context: + assert long_lora_offsets is not None + lora_offset: int = long_lora_context.offsets_by_lora_id.get( + index_mapping_indices[i], 0) + long_lora_offsets[i] = lora_offset + + indices_list: List[Union[List[int], torch.Tensor]] = [ + index_mapping_indices, + lora_indices, + embedding_indices, + ] + if long_lora_context: + assert long_lora_offsets is not None + indices_list.append(long_lora_offsets) + indices = torch.tensor(indices_list, dtype=torch.long, device="cuda") + prompt_mapping_tensor = torch.tensor(prompt_mapping, + device="cuda", + dtype=torch.long) + embeddings_indices = torch.stack([ + indices[2] * extra_vocab_size, + indices[2] * (vocab_size + extra_vocab_size), + ]) + embeddings_indices[embeddings_indices == -1] = max_loras - 1 + base_indices = indices[1] + sampler_indices = prompt_mapping_tensor + sampler_indices_padded = sampler_indices.clone() + sampler_indices_padded[sampler_indices_padded == -1] = max_loras - 1 + sampler_indices_padded = torch.arange( + 0, len(sampler_indices_padded), device="cuda", dtype=torch.long) + ( + sampler_indices_padded * len(sampler_indices_padded)) + long_lora_indices = None + long_lora_indices_len: Optional[int] = None + if long_lora_context: + long_lora_indices = indices[3] + long_lora_indices_len = long_lora_indices.shape[-1] + # Contain length of indices tensors. Used to index into each tensor. + indices_len = [ + base_indices.shape[-1], + sampler_indices.shape[-1], + sampler_indices_padded.shape[-1], + embeddings_indices.shape[-1], + ] + if long_lora_indices_len is not None: + indices_len.append(long_lora_indices_len) + else: + # If long_lora doesn't exist,append None + indices_len.append(None) - def __new__(cls, *args, **kwargs): - if cls._instance is None: - cls._instance = super().__new__(cls) - cls._instance.initialized = False - return cls._instance + return ( + base_indices, + sampler_indices, + sampler_indices_padded, + embeddings_indices, + long_lora_indices, + indices_len, + ) - def __init__(self, max_batches: int = 256, device: str = "cuda"): - """ - Args: - max_batches (int, optional): the maximum batch to pre-allocate. - Defaults to 256. - device (str, optional): Defaults to "cuda". - """ - if not self.initialized: - self.initialized = True - # these attributes are the information required for sgmv kernel - self.b_seq_start_tensor = torch.zeros(max_batches, - dtype=torch.long, - device=device) - self.seq_length_tensor = torch.empty(max_batches, - dtype=torch.long, - device=device) - self.lora_indices_tensor = torch.empty(max_batches, + +class PunicaWrapper: + """PunicaWrapper is designed to manage and provide metadata for the punica + kernel. The main function is to maintain the state information for + Multi-LoRA, and to provide the interface for the punica operator. 
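As an illustrative aside (not part of this patch), the sampler_indices_padded arithmetic in convert_mapping above is easier to follow with a tiny worked example; the batch values below are made up:

    import torch

    max_loras = 4
    # Per-request LoRA slot; -1 means the request uses no LoRA.
    sampler_indices = torch.tensor([0, -1, 1])
    padded = sampler_indices.clone()
    padded[padded == -1] = max_loras - 1  # park "no LoRA" rows in the last slot
    # Row index into the flattened (num_loras * num_requests, vocab) LoRA logits.
    padded = torch.arange(len(padded)) + padded * len(padded)
    print(padded)  # tensor([ 0, 10,  5]) == slot * num_requests + request_idx

Each entry picks out the row that belongs to that request's LoRA once the (num_loras, num_requests, vocab) logits are flattened over their first two dimensions.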
+ """ + + def __init__(self, max_num_batched_tokens: int, max_batches: int, + device: str): + self._token_lora_indices = torch.empty(max_num_batched_tokens, + dtype=torch.long, + device=device) + self._sampler_indices = torch.empty(max_num_batched_tokens, + dtype=torch.long, + device=device) + self._sampler_indices_padded = torch.empty(max_num_batched_tokens, dtype=torch.long, device=device) - self.max_length: int = 0 - self.batch_size: int = -1 - - def _update_metada(self, token_lora_tensor: torch.Tensor) -> None: + self._embeddings_indices = torch.empty(2, + max_num_batched_tokens, + dtype=torch.long, + device=device) + self._long_lora_indices = torch.empty(max_num_batched_tokens, + dtype=torch.long, + device=device) + + # 5 is the number of indicies tensors. + # base_indices, sampler_indices, sampler_indices_padded, + # embeddings_indices,long_lora_indices + self.indices_len: List[Optional[int]] = [None] * 5 + # these attributes are the information required for sgmv kernel + self.b_seq_start_tensor = torch.zeros(max_batches, + dtype=torch.long, + device=device) + self.seq_length_tensor = torch.empty(max_batches, + dtype=torch.long, + device=device) + self.lora_indices_tensor = torch.empty(max_batches, + dtype=torch.long, + device=device) + self.max_length: int = 0 + self.batch_size: int = -1 + self.is_prefill = False + + def update_metadata( + self, + mapping: "LoRAMapping", + lora_index_to_id: List[Optional[int]], + max_loras: int, + vocab_size: int, + extra_vocab_size: int, + long_lora_context: Optional["LongContextLoRAContext"] = None, + ): + + self._update_base_metadata(mapping, lora_index_to_id, max_loras, + vocab_size, extra_vocab_size, + long_lora_context) + if mapping.is_prefill: + # Update metadata required for prefill-related operators. + self._update_prefill_metada(self.token_lora_indices) + self.is_prefill = True + else: + self.is_prefill = False + + def _update_base_metadata( + self, + mapping: "LoRAMapping", + lora_index_to_id: List[Optional[int]], + max_loras: int, + vocab_size: int, + extra_vocab_size: int, + long_lora_context: Optional["LongContextLoRAContext"] = None, + ): + ( + base_indices, + sampler_indices, + sampler_indices_padded, + embeddings_indices, + long_lora_offsets_tensor, + indices_len, + ) = convert_mapping( + mapping, + lora_index_to_id, + max_loras, + vocab_size, + extra_vocab_size, + long_lora_context, + ) + self._token_lora_indices[:base_indices.shape[0]].copy_(base_indices) + self._sampler_indices[:sampler_indices.shape[0]].copy_(sampler_indices) + self._sampler_indices_padded[:sampler_indices_padded.shape[0]].copy_( + sampler_indices_padded) + self._embeddings_indices[:embeddings_indices. + shape[0], :embeddings_indices.shape[1]].copy_( + embeddings_indices) + if long_lora_offsets_tensor is not None: + self._long_lora_indices[:long_lora_offsets_tensor.shape[0]].copy_( + long_lora_offsets_tensor) + else: + self._long_lora_indices.zero_() + + self.indices_len[:] = indices_len + + def _update_prefill_metada(self, token_lora_tensor: torch.Tensor) -> None: (b_seq_start_tensor, seq_length_tensor, lora_indices_tensor, batch_size, max_length) = _compute_meta(token_lora_tensor) @@ -95,280 +287,303 @@ def _update_metada(self, token_lora_tensor: torch.Tensor) -> None: self.batch_size = batch_size self.max_length = max_length - def get_metadata( - self, - token_lora_tensor: torch.Tensor, - need_update: bool = False - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, int, int, ]: - - #Need to recalculate and fill metadata. 
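As an illustrative aside (not part of this patch), the grouping that the prefill-metadata update relies on can be sketched with plain torch calls; this assumes the torch.unique_consecutive approach used by the metadata helper, with made-up per-token LoRA indices:

    import torch

    # Consecutive tokens that share a LoRA collapse into one sgmv "sequence".
    token_lora = torch.tensor([0, 0, 0, 1, 1, 2, 2, 2, 2])
    lora_ids, seq_lens = torch.unique_consecutive(token_lora, return_counts=True)
    seq_starts = torch.zeros_like(seq_lens)
    seq_starts[1:] = seq_lens.cumsum(dim=0)[:-1]
    print(lora_ids)    # tensor([0, 1, 2])
    print(seq_lens)    # tensor([3, 2, 4])
    print(seq_starts)  # tensor([0, 3, 5]); batch_size 3, max_length 4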
- if need_update: - self._update_metada(token_lora_tensor) - + @property + def prefill_metadata( + self) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, int, int]: + """ + This property provides a convenient way to access the necessary + metadata for prefill-related kernel computations. It returns a tuple + containing: + 1. b_seq_start_tensor: Tensor of sequence start positions + 2. seq_length_tensor: Tensor of sequence lengths + 3. lora_indices_tensor: Tensor of lora indices + 4. batch_size: batch size after clustering identical lora indices + 5. max_length: The maximum sequence length in the batch + """ return (self.b_seq_start_tensor[:self.batch_size], self.seq_length_tensor[:self.batch_size], self.lora_indices_tensor[:self.batch_size], self.batch_size, self.max_length) + @property + def token_lora_indices(self) -> torch.Tensor: + """ + This property provides the lora indices corresponding to each token + in the batch + """ + token_lora_len = self.indices_len[0] + return self._token_lora_indices[:token_lora_len] + + @property + def sampler_indices(self) -> torch.Tensor: + """ + This property is used to access the lora indices specifically for + LogitsProcessorWithLoRA + """ + sampler_indices_len = self.indices_len[1] + return self._sampler_indices[:sampler_indices_len] -def get_prefill_meta(token_lora_tensor: torch.Tensor, - need_update: bool = False): - prefill_helper = PrefillHelper(max_batches=256, - device=str(token_lora_tensor.device)) - return prefill_helper.get_metadata(token_lora_tensor, need_update) + @property + def sampler_indices_padded(self) -> torch.Tensor: + """ + This property provides access to padded sampler indices + """ + indices_padded_len = self.indices_len[2] + return self._sampler_indices_padded[:indices_padded_len] + @property + def embeddings_indices(self) -> torch.Tensor: + """ + This property provides access to the indices used for lora embeddings, + specifically for VocabParallelEmbeddingWithLoRA + """ + embeddings_indices_len = self.indices_len[3] + return self._embeddings_indices[:, :embeddings_indices_len] + + @property + def long_lora_indices(self) -> torch.Tensor: + """ + This property provides access to the indices used for long context + lora, specifically for LinearScalingRotaryEmbeddingWithLora + """ + long_lora_len = self.indices_len[4] + return self._long_lora_indices[:long_lora_len] -def shrink_prefill( - y: torch.Tensor, - x: torch.Tensor, - w_t_all: torch.Tensor, - lora_indices_tensor: torch.Tensor, - layer_idx: int, - scale: float, - need_update: bool = False, -): - ( - b_seq_start_tensor, - seq_length_tensor, - last_lora_indices_tensor, - batch_size, - max_length, - ) = get_prefill_meta(lora_indices_tensor, need_update) - sgmv_shrink( - x, - w_t_all, - y, - b_seq_start_tensor, - seq_length_tensor, - last_lora_indices_tensor, - batch_size, - max_length, - scale, - ) + def shrink_prefill( + self, + y: torch.Tensor, + x: torch.Tensor, + w_t_all: torch.Tensor, + scale: float, + ): + sgmv_shrink( + x, + w_t_all, + y, + *self.prefill_metadata, + scale, + ) + + def shrink_decode( + self, + y: torch.Tensor, + x: torch.Tensor, + w_t_all: torch.Tensor, + scale: float, + ): + bgmv_shrink(x, w_t_all, y, self.token_lora_indices, scale) + + def expand_prefill( + self, + y: torch.Tensor, + x: torch.Tensor, + w_t_all: torch.Tensor, + add_input: bool, + ): + sgmv_expand( + x, + w_t_all, + y, + *self.prefill_metadata, + add_input, + ) + + def expand_decode( + self, + y: torch.Tensor, + x: torch.Tensor, + w_t_all: torch.Tensor, + add_input: bool, + ): + 
bgmv_expand(x, w_t_all, y, self.token_lora_indices, add_input) + + def expand_slice_prefill( + self, + y: torch.Tensor, + x: torch.Tensor, + w_t_all: torch.Tensor, + y_offset: Optional[int], + y_slice_size: Optional[int], + add_input: bool, + ): + sgmv_expand_slice( + x, + w_t_all, + y, + *self.prefill_metadata, + y_offset, + y_slice_size, + add_input, + ) + + def expand_slice_decode( + self, + y: torch.Tensor, + x: torch.Tensor, + w_t_all: torch.Tensor, + y_offset: Optional[int], + y_slice_size: Optional[int], + add_input: bool, + ): + bgmv_expand_slice(x, w_t_all, y, self.token_lora_indices, y_offset, + y_slice_size, add_input) + + def add_shrink( + self, + y: torch.Tensor, + x: torch.Tensor, + w_t_all: torch.Tensor, + scale: float, + ): + """ + Perform the ` y+=x@w_t_all` computation, which is suitable for the + GEMM of lora'a. + When `is_prefill is` true, it indicates that it is currently the + prefill stage, and the `shrink_prefill` function should be called. + Otherwise, it is the decode stage, and the shrink_decode function + should be called. + """ + shrink_fun: Callable = (self.shrink_prefill + if self.is_prefill else self.shrink_decode) + shrink_fun(y, x, w_t_all, scale) + def add_expand( + self, + y: torch.Tensor, + x: torch.Tensor, + w_t_all: torch.Tensor, + add_input: bool = True, + ): + """ + Perform the ` y+=x@w_t_all` computation, which is suitable for the + GEMM of lora'b. + When `is_prefill` is true, it indicates that it is currently the + prefill stage, and the `expand_prefill` function should be called. + Otherwise, it is the decode stage, and the expand_decode function + should be called. + """ -def shrink_decode( - y: torch.Tensor, - x: torch.Tensor, - w_t_all: torch.Tensor, - lora_indices_tensor: torch.Tensor, - layer_idx: int, - scale: float, -): - bgmv_shrink(x, w_t_all, y, lora_indices_tensor, scale) - - -def expand_prefill( - y: torch.Tensor, - x: torch.Tensor, - w_t_all: torch.Tensor, - lora_indices_tensor: torch.Tensor, - layer_idx: int, - add_input: bool, - need_update: bool = False, -): - ( - b_seq_start_tensor, - seq_length_tensor, - last_lora_indices_tensor, - batch_size, - max_length, - ) = get_prefill_meta(lora_indices_tensor, need_update) - sgmv_expand(x, w_t_all, y, b_seq_start_tensor, seq_length_tensor, - last_lora_indices_tensor, batch_size, max_length, add_input) - - -def expand_decode( - y: torch.Tensor, - x: torch.Tensor, - w_t_all: torch.Tensor, - lora_indices_tensor: torch.Tensor, - layer_idx: int, - add_input: bool, -): - bgmv_expand(x, w_t_all, y, lora_indices_tensor, add_input) - - -def expand_slice_prefill( - y: torch.Tensor, - x: torch.Tensor, - w_t_all: torch.Tensor, - lora_indices_tensor: torch.Tensor, - layer_idx: int, - y_offset: Optional[int], - y_slice_size: Optional[int], - add_input: bool, - need_update: bool = False, -): - ( - b_seq_start_tensor, - seq_length_tensor, - last_lora_indices_tensor, - batch_size, - max_length, - ) = get_prefill_meta(lora_indices_tensor, need_update) - sgmv_expand_slice(x, w_t_all, y, b_seq_start_tensor, seq_length_tensor, - last_lora_indices_tensor, batch_size, max_length, - y_offset, y_slice_size, add_input) - - -def expand_slice_decode(y: torch.Tensor, x: torch.Tensor, - w_t_all: torch.Tensor, - lora_indices_tensor: torch.Tensor, layer_idx: int, - y_offset: Optional[int], y_slice_size: Optional[int], - add_input: bool): - bgmv_expand_slice(x, w_t_all, y, lora_indices_tensor, y_offset, - y_slice_size, add_input) - - -def add_shrink( - y: torch.Tensor, - x: torch.Tensor, - w_t_all: torch.Tensor, - 
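As an illustrative aside (not part of this patch), the shrink/expand pair documented above composes into the usual low-rank LoRA update; plain matmuls stand in for the sgmv/bgmv kernels and the shapes are made up:

    import torch

    x = torch.randn(4, 16)         # tokens x hidden
    lora_a = torch.randn(16, 8)    # hidden x rank
    lora_b = torch.randn(8, 32)    # rank x output
    scale = 0.5

    y = torch.zeros(4, 32)
    buffer = (x @ lora_a) * scale  # "shrink" into the rank-8 space
    y += buffer @ lora_b           # "expand" back out, accumulating into y
    assert torch.allclose(y, x @ lora_a @ lora_b * scale, atol=1e-4)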
lora_indices_tensor: torch.Tensor, - layer_idx: int, - scale: float, - is_prefill: bool, - need_update: bool = False, -): - """ - Perform the ` y+=x@w_t_all` computation, which is suitable for the - GEMM of lora'a. - When `is_prefill is` true, it indicates that it is currently the - prefill stage, and the `shrink_prefill` function should be called. - Otherwise, it is the decode stage, and the shrink_decode function - should be called. - """ - if is_prefill: - shrink_prefill(y, x, w_t_all, lora_indices_tensor, layer_idx, scale, - need_update) - else: - shrink_decode(y, x, w_t_all, lora_indices_tensor, layer_idx, scale) - - -def add_expand( - y: torch.Tensor, - x: torch.Tensor, - w_t_all: torch.Tensor, - lora_indices_tensor: torch.Tensor, - layer_idx: int, - is_prefill: bool, - add_input: bool = True, - need_update: bool = False, -): - """ - Perform the ` y+=x@w_t_all` computation, which is suitable for the - GEMM of lora'b. - When `is_prefill` is true, it indicates that it is currently the - prefill stage, and the `expand_prefill` function should be called. - Otherwise, it is the decode stage, and the expand_decode function - should be called. - """ - if is_prefill: - expand_prefill(y, x, w_t_all, lora_indices_tensor, layer_idx, - add_input, need_update) - else: - expand_decode(y, x, w_t_all, lora_indices_tensor, layer_idx, add_input) - - -def add_expand_slice( - y: torch.Tensor, - x: torch.Tensor, - w_t_all: torch.Tensor, - lora_indices_tensor: torch.Tensor, - layer_idx: int, - is_prefill: bool, - y_offset: Optional[int], - y_slice_size: Optional[int], - add_input: bool = True, - need_update: bool = False, -): - """ - Similar to `add_expand` - """ - if is_prefill: - expand_slice_prefill(y, x, w_t_all, lora_indices_tensor, layer_idx, - y_offset, y_slice_size, add_input, need_update) - else: - expand_slice_decode(y, x, w_t_all, lora_indices_tensor, layer_idx, - y_offset, y_slice_size, add_input) - - -def add_lora( - y: torch.Tensor, - x: torch.Tensor, - wa_t_all: torch.Tensor, - wb_t_all: torch.Tensor, - lora_indices_tensor: torch.Tensor, - layer_idx: int, - scale: float, - is_prefill: bool, - y_offset: Optional[int] = None, - y_slice_size: Optional[int] = None, - *, - buffer: Optional[torch.Tensor] = None, - need_update: bool = False, -): - """ - Semantics: - y[i] += ( - x[i].unsqueeze(0) - @ wa_t_all[indices[i], layer_idx, :, :].transpose(-1, -2) - @ wb_t_all[indices[i], layer_idx, :, :].transpose(-1, -2) - * scale - ).squeeze(0) - Args: - y (torch.Tensor): Output tensor. Will be changed in-place. - x (torch.Tensor): Input tensor - wa_t_all (torch.Tensor): lora_a's weight - wb_t_all (torch.Tensor): lora_b's weight - lora_indices_tensor (torch.Tensor): _description_ - layer_idx (int): Layer index of LoRA weights. - scale (float): Scaling factor. - is_prefill (bool): prefiling stage - y_offset (Optional[int], optional): Offset to apply to the starting - column of y. - y_slice_size (Optional[int], optional): Size of the y column slice.. - buffer (Optional[torch.Tensor], optional): Defaults to None. - need_update (bool, optional): Indicates whether updating sgmv metadata - is needed. Defaults to False. 
- """ + expand_fun: Callable = (self.expand_prefill + if self.is_prefill else self.expand_decode) + expand_fun(y, x, w_t_all, add_input) + + def add_expand_slice(self, + y: torch.Tensor, + x: torch.Tensor, + w_t_all: torch.Tensor, + y_offset: Optional[int], + y_slice_size: Optional[int], + add_input: bool = True): + """ + Similar to `add_expand` + """ - r = wb_t_all.size(-1) - if buffer is None: - # We set the buffer to be float32 by default ,refer to: - # https://github.com/triton-lang/triton/issues/1387 - buffer = torch.zeros((x.size(0), r), - dtype=torch.float32, - device=x.device) - - add_shrink( - buffer, - x, - wa_t_all, - lora_indices_tensor, - 0, - scale, - is_prefill, - need_update=need_update, - ) - if y_offset is None and y_slice_size is None: - add_expand(y, - buffer, - wb_t_all, - lora_indices_tensor, - 0, - is_prefill, - add_input=True, - need_update=need_update) - else: - add_expand_slice(y, - buffer, - wb_t_all, - lora_indices_tensor, - 0, - is_prefill, - y_offset, - y_slice_size, - add_input=True, - need_update=need_update) + expand_slice_fun: Callable = (self.expand_slice_prefill + if self.is_prefill else + self.expand_slice_decode) + expand_slice_fun(y, x, w_t_all, y_offset, y_slice_size, add_input) + + def add_lora(self, + y: torch.Tensor, + x: torch.Tensor, + wa_t_all: torch.Tensor, + wb_t_all: torch.Tensor, + scale: float, + y_offset: Optional[int] = None, + y_slice_size: Optional[int] = None, + *, + buffer: Optional[torch.Tensor] = None) -> None: + """ + Semantics: + y[i] += ( + x[i].unsqueeze(0) + @ wa_t_all[indices[i], layer_idx, :, :].transpose(-1, -2) + @ wb_t_all[indices[i], layer_idx, :, :].transpose(-1, -2) + * scale + ).squeeze(0) + Args: + y (torch.Tensor): Output tensor. Will be changed in-place. + x (torch.Tensor): Input tensor + wa_t_all (torch.Tensor): lora_a's weight + wb_t_all (torch.Tensor): lora_b's weight + scale (float): Scaling factor. + y_offset (Optional[int], optional): Offset to apply to the starting + column of y. + y_slice_size (Optional[int], optional): Size of the y column slice.. + buffer (Optional[torch.Tensor], optional): Defaults to None. + """ + y_org = y + y = y.view(-1, y.shape[-1]) + x = x.view(-1, x.shape[-1]) + r = wb_t_all.size(-1) + if buffer is None: + # We set the buffer to be float32 by default ,refer to: + # https://github.com/triton-lang/triton/issues/1387 + buffer = torch.zeros((x.size(0), r), + dtype=torch.float32, + device=x.device) + + self.add_shrink(buffer, x, wa_t_all, scale) + if y_offset is None and y_slice_size is None: + self.add_expand(y, buffer, wb_t_all, add_input=True) + else: + self.add_expand_slice(y, + buffer, + wb_t_all, + y_offset, + y_slice_size, + add_input=True) + y = y.view_as(y_org) + + def add_lora_packed_nslice(self, y: torch.Tensor, x: torch.Tensor, + lora_a_stacked: Tuple[torch.Tensor, + torch.Tensor, + torch.Tensor], + lora_b_stacked: Tuple[torch.Tensor, + torch.Tensor, + torch.Tensor], + scale: float, + output_slices: Tuple[int, ...]) -> None: + """ + Applies lora to each input. Similar to add_lora, This method is + used for layers that are composed of multiple sublayers + (slices) packed together. 
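As an illustrative aside (not part of this patch), add_lora_packed_nslice walks the packed output one slice at a time via output_slices; the widths below are made up:

    # e.g. a packed QKV projection with widths (q, k, v)
    output_slices = (4096, 1024, 1024)
    offset_left = 0
    for slice_idx, width in enumerate(output_slices):
        # add_lora(...) writes into y[:, offset_left : offset_left + width]
        print(f"slice {slice_idx}: columns [{offset_left}, {offset_left + width})")
        offset_left += width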
+ """ + y_org = y + x = x.view(-1, x.shape[-1]) + y = y.view(-1, y.shape[-1]) + offset_left = 0 + # TODO fuse these kernels + for slice_idx in range(len(output_slices)): + self.add_lora(y, x, lora_a_stacked[slice_idx], + lora_b_stacked[slice_idx], scale, offset_left, + output_slices[slice_idx]) + offset_left += output_slices[slice_idx] + + y = y.view_as(y_org) + + def add_lora_logits(self, + y: torch.Tensor, + x: torch.Tensor, + wa_t_all: torch.Tensor, + wb_t_all: torch.Tensor, + scale, + *, + buffer: Optional[torch.Tensor] = None) -> None: + """ + LogitsProcessorWithLoRA always using bgmv + """ + y_org = y + y = y.view(-1, y.shape[-1]) + x = x.view(-1, x.shape[-1]) + r = wb_t_all.size(-1) + if buffer is None: + # We set the buffer to be float32 by default ,refer to: + # https://github.com/triton-lang/triton/issues/1387 + buffer = torch.zeros((x.size(0), r), + dtype=torch.float32, + device=x.device) + + bgmv_shrink(x, wa_t_all, buffer, self.sampler_indices, scale) + bgmv_expand(buffer, wb_t_all, y, self.sampler_indices, add_inputs=True) + y = y.view_as(y_org) From 7035a2903d1c303122f0a06e5b89a347977786bb Mon Sep 17 00:00:00 2001 From: jeejeeli Date: Thu, 11 Jul 2024 11:38:49 +0800 Subject: [PATCH 49/71] update lora unit test --- tests/lora/test_layers.py | 140 +++++++++++-------- tests/lora/test_lora.py | 263 ------------------------------------ vllm/worker/model_runner.py | 12 +- 3 files changed, 89 insertions(+), 326 deletions(-) delete mode 100644 tests/lora/test_lora.py diff --git a/tests/lora/test_layers.py b/tests/lora/test_layers.py index 7207af6b1a4b..6f33f56616fc 100644 --- a/tests/lora/test_layers.py +++ b/tests/lora/test_layers.py @@ -26,7 +26,8 @@ VocabParallelEmbeddingWithLoRA) # yapf: enable from vllm.lora.models import (LongContextLoRAContext, LoRALayerWeights, - PackedLoRALayerWeights, convert_mapping) + PackedLoRALayerWeights) +from vllm.lora.punica import PunicaWrapper from vllm.model_executor.layers.linear import (ColumnParallelLinear, MergedColumnParallelLinear, QKVParallelLinear, @@ -47,6 +48,9 @@ CUDA_DEVICES = [ f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) ] +# We will launch different triton kernels between the prefill and decode +# stages, so we need to verify this. 
prefill stage(True) or decode stage(False) +STAGES = [True, False] def get_random_id_to_index(num_loras: int, @@ -182,10 +186,12 @@ def create_random_inputs( @pytest.mark.parametrize("num_loras", [1, 2, 4, 8]) @pytest.mark.parametrize("device", CUDA_DEVICES) @pytest.mark.parametrize("vocab_size", [512, 32000, 64000, 128000]) -def test_embeddings(dist_init, num_loras, device, vocab_size) -> None: +@pytest.mark.parametrize("stage", STAGES) +def test_embeddings(dist_init, num_loras, device, vocab_size, stage) -> None: torch.set_default_device(device) max_loras = 8 + punica_wrapper = PunicaWrapper(8192, 256, device) lora_config = LoRAConfig(max_loras=max_loras, max_lora_rank=8, lora_dtype=torch.float16) @@ -204,7 +210,7 @@ def create_random_embedding_layer(): id_to_index = get_random_id_to_index(num_loras, max_loras) embedding, lora_embedding = create_random_embedding_layer() - + lora_embedding.set_mapping(punica_wrapper) lora_dict, _ = populate_loras( id_to_index, layer=lora_embedding, @@ -217,12 +223,12 @@ def create_random_embedding_layer(): input_size=(200, ), input_range=(1, vocab_size), ) - lora_mapping = LoRAMapping(index_mapping, prompt_mapping) - - mapping_info = convert_mapping(lora_mapping, id_to_index, max_loras, + lora_mapping = LoRAMapping(index_mapping, + prompt_mapping, + is_prefill=stage) + punica_wrapper.update_metadata(lora_mapping, id_to_index, max_loras, vocab_size, lora_config.lora_extra_vocab_size) - lora_embedding.set_mapping(*mapping_info) lora_result = lora_embedding(torch.cat(inputs)) @@ -255,12 +261,12 @@ def create_random_embedding_layer(): input_size=(200, ), input_range=(1, vocab_size), ) - lora_mapping = LoRAMapping(index_mapping, prompt_mapping) - - mapping_info = convert_mapping(lora_mapping, id_to_index, max_loras, + lora_mapping = LoRAMapping(index_mapping, + prompt_mapping, + is_prefill=stage) + punica_wrapper.update_metadata(lora_mapping, id_to_index, max_loras, vocab_size, lora_config.lora_extra_vocab_size) - lora_embedding.set_mapping(*mapping_info, ) lora_result = lora_embedding(torch.cat(inputs)) expected_result = embedding(torch.cat(inputs)) @@ -278,11 +284,13 @@ def create_random_embedding_layer(): @pytest.mark.parametrize("num_loras", [1, 2, 4, 8]) @pytest.mark.parametrize("device", CUDA_DEVICES) @pytest.mark.parametrize("vocab_size", [512, 32000, 64000, 128000]) +@pytest.mark.parametrize("stage", STAGES) def test_embeddings_with_new_embeddings(dist_init, num_loras, device, - vocab_size) -> None: + vocab_size, stage) -> None: torch.set_default_device(device) max_loras = 8 + punica_wrapper = PunicaWrapper(8192, 256, device) lora_config = LoRAConfig(max_loras=max_loras, max_lora_rank=8, lora_dtype=torch.float16) @@ -318,6 +326,7 @@ def create_random_embedding_layer(): generate_embeddings_tensor=256, ) + lora_embedding.set_mapping(punica_wrapper) # All embeddings tensors have the same shape. 
embeddings_tensors = [ lora_dict[id].embeddings_tensor for id in sorted(lora_dict.keys()) @@ -334,8 +343,12 @@ def create_random_embedding_layer(): input_size=(200, ), input_range=(1, vocab_size), ) - lora_mapping = LoRAMapping(index_mapping, prompt_mapping) - + lora_mapping = LoRAMapping(index_mapping, + prompt_mapping, + is_prefill=stage) + punica_wrapper.update_metadata(lora_mapping, id_to_index, max_loras, + vocab_size, + lora_config.lora_extra_vocab_size) original_inputs = deepcopy(inputs) # Force some of the inputs to be in the extended embeddings range @@ -349,11 +362,6 @@ def create_random_embedding_layer(): (embedding_id + 1) * embeddings_tensor_len - 1) original_input_[-2] = vocab_size + embeddings_tensor_len - 1 - mapping_info = convert_mapping(lora_mapping, id_to_index, max_loras, - vocab_size, - lora_config.lora_extra_vocab_size) - lora_embedding.set_mapping(*mapping_info, ) - expanded_embedding.weight[vocab_size:vocab_size + (embeddings_tensor_len * max_loras)] = torch.cat(embeddings_tensors) @@ -390,15 +398,13 @@ def create_random_embedding_layer(): input_size=(200, ), input_range=(1, vocab_size), ) - lora_mapping = LoRAMapping(index_mapping, prompt_mapping) - original_inputs = deepcopy(inputs) - - mapping_info = convert_mapping(lora_mapping, id_to_index, max_loras, + lora_mapping = LoRAMapping(index_mapping, + prompt_mapping, + is_prefill=stage) + punica_wrapper.update_metadata(lora_mapping, id_to_index, max_loras, vocab_size, lora_config.lora_extra_vocab_size) - lora_embedding.set_mapping(*mapping_info, ) - lora_result = lora_embedding(torch.cat(original_inputs)) expected_result = expanded_embedding(torch.cat(inputs)) @@ -413,11 +419,13 @@ def create_random_embedding_layer(): @pytest.mark.parametrize("num_loras", [1, 2, 4, 8]) @pytest.mark.parametrize("device", CUDA_DEVICES) @pytest.mark.parametrize("vocab_size", [512, 32000, 64000, 128000]) -def test_lm_head_logits_processor(dist_init, num_loras, device, - vocab_size) -> None: +@pytest.mark.parametrize("stage", STAGES) +def test_lm_head_logits_processor(dist_init, num_loras, device, vocab_size, + stage) -> None: torch.set_default_device(device) max_loras = 8 + punica_wrapper = PunicaWrapper(8192, 256, device) lora_config = LoRAConfig(max_loras=max_loras, max_lora_rank=8, lora_dtype=torch.float16) @@ -443,7 +451,7 @@ def _pretest(): id_to_index = get_random_id_to_index(num_loras, max_loras) linear, logits_processor, lora_logits_processor = _pretest() - + lora_logits_processor.set_mapping(punica_wrapper) # NOTE: all the generated loras share the same embeddings tensor. 
lora_dict, _ = populate_loras( id_to_index, @@ -461,17 +469,17 @@ def _pretest(): input_range=(0, 1), input_type=torch.float16, ) - lora_mapping = LoRAMapping(index_mapping, prompt_mapping) - - input_ = torch.rand(20, 1024) - mapping_info = convert_mapping( + lora_mapping = LoRAMapping(index_mapping, + prompt_mapping, + is_prefill=stage) + punica_wrapper.update_metadata( lora_mapping, id_to_index, max_loras, vocab_size, lora_config.lora_extra_vocab_size, ) - lora_logits_processor.set_mapping(*mapping_info, ) + input_ = torch.rand(20, 1024) lora_result = lora_logits_processor._get_logits( hidden_states=torch.cat(inputs), @@ -510,12 +518,16 @@ def _pretest(): input_range=(0, 1), input_type=torch.float16, ) - lora_mapping = LoRAMapping(index_mapping, prompt_mapping) - - mapping_info = convert_mapping(lora_mapping, id_to_index, max_loras, - vocab_size, - lora_config.lora_extra_vocab_size) - lora_logits_processor.set_mapping(*mapping_info, ) + lora_mapping = LoRAMapping(index_mapping, + prompt_mapping, + is_prefill=stage) + punica_wrapper.update_metadata( + lora_mapping, + id_to_index, + max_loras, + vocab_size, + lora_config.lora_extra_vocab_size, + ) lora_result = lora_logits_processor._get_logits( hidden_states=torch.cat(inputs), @@ -538,10 +550,12 @@ def _pretest(): @pytest.mark.parametrize("orientation", ["row", "column"]) @pytest.mark.parametrize("fully_shard", [True, False]) @pytest.mark.parametrize("device", CUDA_DEVICES) +@pytest.mark.parametrize("stage", STAGES) def test_linear_parallel(dist_init, num_loras, orientation, fully_shard, - device) -> None: + device, stage) -> None: torch.set_default_device(device) + punica_wrapper = PunicaWrapper(8192, 256, device) max_loras = 8 lora_config = LoRAConfig(max_loras=max_loras, max_lora_rank=8, @@ -575,7 +589,7 @@ def create_random_linear_parallel_layer(): id_to_index = get_random_id_to_index(num_loras, max_loras) linear, lora_linear = create_random_linear_parallel_layer() - + lora_linear.set_mapping(punica_wrapper) lora_dict, _ = populate_loras( id_to_index, layer=lora_linear, @@ -589,16 +603,16 @@ def create_random_linear_parallel_layer(): input_range=(0, 1), input_type=torch.float16, ) - lora_mapping = LoRAMapping(index_mapping, prompt_mapping) - - mapping_info = convert_mapping( + lora_mapping = LoRAMapping(index_mapping, + prompt_mapping, + is_prefill=stage) + punica_wrapper.update_metadata( lora_mapping, id_to_index, max_loras, 512, lora_config.lora_extra_vocab_size, ) - lora_linear.set_mapping(*mapping_info, ) lora_result = lora_linear(torch.cat(inputs))[0] @@ -628,11 +642,12 @@ def create_random_linear_parallel_layer(): input_range=(0, 1), input_type=torch.float16, ) - lora_mapping = LoRAMapping(index_mapping, prompt_mapping) + lora_mapping = LoRAMapping(index_mapping, + prompt_mapping, + is_prefill=stage) - mapping_info = convert_mapping(lora_mapping, id_to_index, max_loras, + punica_wrapper.update_metadata(lora_mapping, id_to_index, max_loras, 512, lora_config.lora_extra_vocab_size) - lora_linear.set_mapping(*mapping_info, ) lora_result = lora_linear(torch.cat(inputs))[0] expected_result = linear(torch.cat(inputs))[0] @@ -649,10 +664,12 @@ def create_random_linear_parallel_layer(): @pytest.mark.parametrize("repeats", [1, 2, 3]) @pytest.mark.parametrize("fully_shard", [True, False]) @pytest.mark.parametrize("device", CUDA_DEVICES) +@pytest.mark.parametrize("stage", STAGES) def test_column_parallel_packed(dist_init, num_loras, repeats, fully_shard, - device) -> None: + device, stage) -> None: torch.set_default_device(device) + 
punica_wrapper = PunicaWrapper(8192, 256, device) max_loras = 8 lora_config = LoRAConfig(max_loras=max_loras, max_lora_rank=8, @@ -707,7 +724,7 @@ class FakeConfig: id_to_index = get_random_id_to_index(num_loras, max_loras) linear, lora_linear = create_column_parallel_packed_layer() - + lora_linear.set_mapping(punica_wrapper) lora_dict, sublora_dict = populate_loras( id_to_index, layer=lora_linear, @@ -722,16 +739,17 @@ class FakeConfig: input_range=(0, 1), input_type=torch.float16, ) - lora_mapping = LoRAMapping(index_mapping, prompt_mapping) + lora_mapping = LoRAMapping(index_mapping, + prompt_mapping, + is_prefill=stage) - mapping_info = convert_mapping( + punica_wrapper.update_metadata( lora_mapping, id_to_index, max_loras, 512, lora_config.lora_extra_vocab_size, ) - lora_linear.set_mapping(*mapping_info) lora_result = lora_linear(torch.cat(inputs))[0] @@ -762,16 +780,18 @@ class FakeConfig: input_range=(0, 1), input_type=torch.float16, ) - lora_mapping = LoRAMapping(index_mapping, prompt_mapping) + lora_mapping = LoRAMapping(index_mapping, + prompt_mapping, + is_prefill=stage) - mapping_info = convert_mapping( + punica_wrapper.update_metadata( lora_mapping, id_to_index, max_loras, 512, lora_config.lora_extra_vocab_size, ) - lora_linear.set_mapping(*mapping_info) + # lora_linear.set_mapping(*mapping_info) lora_result = lora_linear(torch.cat(inputs))[0] expected_result = linear(torch.cat(inputs))[0] @@ -803,7 +823,7 @@ def test_rotary_embedding_long_context(dist_init, num_loras, device, if torch.cuda.is_available(): torch.cuda.manual_seed(seed) torch.set_default_device(device) - + punica_wrapper = PunicaWrapper(8192, 256, device) max_loras = 8 lora_config = LoRAConfig(max_loras=max_loras, max_lora_rank=8, @@ -825,6 +845,7 @@ def test_rotary_embedding_long_context(dist_init, num_loras, device, is_neox_style, ) lora_rope = LinearScalingRotaryEmbeddingWithLora(rope) + lora_rope.set_mapping(punica_wrapper) lora_rope.create_lora_weights(max_loras, lora_config) linear_rope = get_rope(head_size, rotary_dim, max_position, base, is_neox_style, { @@ -840,6 +861,7 @@ def test_rotary_embedding_long_context(dist_init, num_loras, device, input_range=(0, lora_config.lora_extra_vocab_size), input_type=torch.float16, ) + lora_mapping = LoRAMapping(index_mapping, prompt_mapping) long_lora_context = LongContextLoRAContext(list(scaling_factors), rotary_dim) @@ -854,7 +876,7 @@ def test_rotary_embedding_long_context(dist_init, num_loras, device, for i in range(len(scaling_factors)): long_lora_context.offsets_by_lora_id[i] = scaling_factor_to_offset.get( scaling_factors[i], 0) - mapping_info = convert_mapping( + punica_wrapper.update_metadata( lora_mapping, id_to_index, max_loras, @@ -862,7 +884,7 @@ def test_rotary_embedding_long_context(dist_init, num_loras, device, lora_config.lora_extra_vocab_size, long_lora_context=long_lora_context, ) - lora_rope.set_mapping(*mapping_info) + # lora_rope.set_mapping(*mapping_info) positions = torch.randint(0, max_position, (batch_size, seq_len)) query = torch.randn(batch_size, diff --git a/tests/lora/test_lora.py b/tests/lora/test_lora.py deleted file mode 100644 index a4ca7a93e62e..000000000000 --- a/tests/lora/test_lora.py +++ /dev/null @@ -1,263 +0,0 @@ -import pytest -import torch - -from vllm.lora.layers import _apply_lora, _apply_lora_packed_nslice - -from .utils import DummyLoRAManager - -TENSOR_SIZES = [128, 1024, 2048, 4096, 8192, 11008, 11008 // 2, 11008 // 4] -QKV_TENSOR_SIZES = [ - (8192, 1024, 1024), - (8192 // 8, 1024 // 8, 1024 // 8), - (4096, 4096, 
4096), - (4096 // 2, 4096 // 2, 4096 // 2), -] -BATCH_SIZES = [8, 32, 256] -RANKS = [8] -DTYPES = [torch.float16] -TOLERANCES = { - torch.float16: (5e-3, 5e-3), - torch.bfloat16: (3e-2, 2e-2), -} - -STAGES = [0, 1] #prefill stage(1) or decode stage(0) - - -@pytest.mark.parametrize("m", TENSOR_SIZES) -@pytest.mark.parametrize("n", TENSOR_SIZES) -@pytest.mark.parametrize("k", BATCH_SIZES) -@pytest.mark.parametrize("rank", RANKS) -@pytest.mark.parametrize("dtype", DTYPES) -@pytest.mark.parametrize("stage", STAGES) -def test_apply_lora(m, n, k, rank, dtype, stage) -> None: - manager = DummyLoRAManager() - - module_name = "module" - weight = torch.rand([m, n], device="cuda", dtype=dtype) - - manager.init_random_lora(module_name, weight, rank=rank) - lora = manager.get_module_lora(module_name) - - input = torch.rand(k, n, device="cuda", dtype=dtype) - expected = input @ lora.lora_a @ lora.lora_b * lora.scaling - - lora_a_stack = torch.zeros(8, - 1, - lora.lora_a.shape[1], - lora.lora_a.shape[0], - device="cuda", - dtype=dtype) - lora_b_stack = torch.zeros(8, - 1, - lora.lora_b.shape[1], - lora.lora_b.shape[0], - device="cuda", - dtype=dtype) - for i in range(lora_a_stack.shape[0]): - lora_a_stack[i][0] = lora.lora_a.T - lora_b_stack[i][0] = (lora.lora_b * lora.scaling).T - indices_info = [None] * 6 - indices_info[0] = k - indices_info[5] = stage - output = torch.zeros(k, m, device="cuda", dtype=dtype) - _apply_lora(input, - lora_a_stack, - lora_b_stack, - torch.randint(0, - lora_a_stack.shape[0], (len(input), ), - device="cuda"), - indices_info, - output, - need_update=True) - - rtol, atol = TOLERANCES[dtype] - assert torch.allclose(expected, output, rtol=rtol, atol=atol) - - output[:] = 0 - _apply_lora(input, - lora_a_stack, - lora_b_stack, - torch.full((len(input), ), -1, device="cuda"), - indices_info, - output, - need_update=True) - assert torch.allclose(torch.zeros_like(output), output) - - manager.reset_lora() - - -@pytest.mark.parametrize("m", TENSOR_SIZES) -@pytest.mark.parametrize("n", TENSOR_SIZES) -@pytest.mark.parametrize("k", BATCH_SIZES) -@pytest.mark.parametrize("rank", RANKS) -@pytest.mark.parametrize("dtype", DTYPES) -@pytest.mark.parametrize("stage", STAGES) -def test_apply_lora_packed_2slice(m, n, k, rank, dtype, stage) -> None: - if m % 2 != 0: - pytest.skip("m must be divisible by 2") - if m // 2 not in TENSOR_SIZES: - pytest.skip("m//2 must be in TENSOR_SIZES") - - manager = DummyLoRAManager() - - module_name = "module" - weight = torch.rand([m // 2, n], device="cuda", dtype=dtype) - - manager.init_random_lora(module_name + "1", weight, rank=rank) - lora_1 = manager.get_module_lora(module_name + "1") - manager.init_random_lora(module_name + "2", weight, rank=rank) - lora_2 = manager.get_module_lora(module_name + "2") - - input = torch.rand(k, n, device="cuda", dtype=dtype) - expected = torch.cat([ - input @ lora_1.lora_a @ lora_1.lora_b * lora_1.scaling, - input @ lora_2.lora_a @ lora_2.lora_b * lora_2.scaling - ], - dim=1) - - lora_a_stacks = [ - torch.zeros(8, - 1, - lora_1.lora_a.shape[1], - lora_1.lora_a.shape[0], - device="cuda", - dtype=dtype) for i in range(2) - ] - lora_b_stacks = [ - torch.zeros(8, - 1, - lora_1.lora_b.shape[1], - lora_1.lora_b.shape[0], - device="cuda", - dtype=dtype) for i in range(2) - ] - for i in range(lora_a_stacks[0].shape[0]): - lora_a_stacks[0][i][0] = lora_1.lora_a.T - lora_b_stacks[0][i][0] = (lora_1.lora_b * lora_1.scaling).T - lora_a_stacks[1][i][0] = lora_2.lora_a.T - lora_b_stacks[1][i][0] = (lora_2.lora_b * lora_2.scaling).T - 
indices_info = [None] * 6 - indices_info[0] = k - indices_info[5] = stage - output = torch.zeros(k, m, device="cuda", dtype=dtype) - _apply_lora_packed_nslice(input, - lora_a_stacks, - lora_b_stacks, - torch.randint(0, - lora_a_stacks[0].shape[0], - (len(input), ), - device="cuda"), - indices_info, - output, (m // 2, m // 2), - need_update=True) - - rtol, atol = TOLERANCES[dtype] - assert torch.allclose(expected, output, rtol=rtol, atol=atol) - - output[:] = 0 - _apply_lora_packed_nslice(input, - lora_a_stacks, - lora_b_stacks, - torch.full((len(input), ), -1, device="cuda"), - indices_info, - output, (m // 2, m // 2), - need_update=True) - assert torch.allclose(torch.zeros_like(output), output) - - manager.reset_lora() - - -@pytest.mark.parametrize("qkv", QKV_TENSOR_SIZES) -@pytest.mark.parametrize("n", TENSOR_SIZES) -@pytest.mark.parametrize("k", BATCH_SIZES) -@pytest.mark.parametrize("rank", RANKS) -@pytest.mark.parametrize("dtype", DTYPES) -@pytest.mark.parametrize("stage", STAGES) -def test_apply_lora_packed_3slice(qkv, n, k, rank, dtype, stage) -> None: - manager = DummyLoRAManager() - - module_name = "module" - weight_q = torch.empty(qkv[0], n, device="cuda", dtype=dtype) - weight_kv = torch.empty(qkv[1], n, device="cuda", dtype=dtype) - - manager.init_random_lora(module_name + "q", weight_q, rank=rank) - lora_q = manager.get_module_lora(module_name + "q") - manager.init_random_lora(module_name + "k", weight_kv, rank=rank) - lora_k = manager.get_module_lora(module_name + "k") - manager.init_random_lora(module_name + "v", weight_kv, rank=rank) - lora_v = manager.get_module_lora(module_name + "v") - - input = torch.rand(k, n, device="cuda", dtype=dtype) - expected = torch.cat([ - input @ lora_q.lora_a @ lora_q.lora_b * lora_q.scaling, - input @ lora_k.lora_a @ lora_k.lora_b * lora_k.scaling, - input @ lora_v.lora_a @ lora_v.lora_b * lora_v.scaling - ], - dim=1) - - lora_a_stacks = [ - torch.zeros(8, - 1, - lora_q.lora_a.shape[1], - lora_q.lora_a.shape[0], - device="cuda", - dtype=dtype) - ] + [ - torch.zeros(8, - 1, - lora_k.lora_a.shape[1], - lora_k.lora_a.shape[0], - device="cuda", - dtype=dtype) for i in range(2) - ] - lora_b_stacks = [ - torch.zeros(8, - 1, - lora_q.lora_b.shape[1], - lora_q.lora_b.shape[0], - device="cuda", - dtype=dtype) - ] + [ - torch.zeros(8, - 1, - lora_k.lora_b.shape[1], - lora_k.lora_b.shape[0], - device="cuda", - dtype=dtype) for i in range(2) - ] - for i in range(lora_a_stacks[0].shape[0]): - lora_a_stacks[0][i][0] = lora_q.lora_a.T - lora_b_stacks[0][i][0] = (lora_q.lora_b * lora_q.scaling).T - lora_a_stacks[1][i][0] = lora_k.lora_a.T - lora_b_stacks[1][i][0] = (lora_k.lora_b * lora_k.scaling).T - lora_a_stacks[2][i][0] = lora_v.lora_a.T - lora_b_stacks[2][i][0] = (lora_v.lora_b * lora_v.scaling).T - indices_info = [None] * 6 - indices_info[0] = k - indices_info[5] = stage #decoding stage - output = torch.zeros(k, sum(qkv), device="cuda", dtype=dtype) - _apply_lora_packed_nslice(input, - lora_a_stacks, - lora_b_stacks, - torch.randint(0, - lora_a_stacks[0].shape[0], - (len(input), ), - device="cuda"), - indices_info, - output, (qkv[0], qkv[1], qkv[2]), - need_update=True) - - rtol, atol = TOLERANCES[dtype] - assert torch.allclose(expected, output, rtol=rtol, atol=atol) - - output[:] = 0 - _apply_lora_packed_nslice(input, - lora_a_stacks, - lora_b_stacks, - torch.full((len(input), ), -1, device="cuda"), - indices_info, - output, (qkv[0], qkv[1], qkv[2]), - need_update=True) - assert torch.allclose(torch.zeros_like(output), output) - - 
manager.reset_lora() diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 7fc5febcd249..cdb84caebcfc 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -790,8 +790,10 @@ def _prepare_model_input_tensors( ) if self.lora_config: - lora_mapping = LoRAMapping(lora_index_mapping, lora_prompt_mapping, - is_prompt) + lora_mapping = LoRAMapping( + **dict(index_mapping=lora_index_mapping, + prompt_mapping=lora_prompt_mapping, + is_prefill=is_prompt)) else: lora_mapping = None @@ -1138,8 +1140,10 @@ def capture_model(self, kv_caches: List[List[torch.Tensor]]) -> None: ) if self.lora_config: - lora_mapping = LoRAMapping([0] * batch_size, - [0] * batch_size, False) + lora_mapping = LoRAMapping( + **dict(index_mapping=[0] * batch_size, + prompt_mapping=[0] * batch_size, + is_prefill=False)) self.set_active_loras(set(), lora_mapping) if self.prompt_adapter_config: From 391d7614dedee03fa6c44c7cdec768559b6d1841 Mon Sep 17 00:00:00 2001 From: jeejeeli Date: Fri, 12 Jul 2024 14:05:20 +0800 Subject: [PATCH 50/71] reduce triton overhead --- vllm/lora/ops/libentry.py | 137 +++++++++++++++++++++++++++++ vllm/lora/ops/sgmv_expand.py | 3 + vllm/lora/ops/sgmv_expand_slice.py | 3 + vllm/lora/ops/sgmv_shrink.py | 3 + vllm/lora/punica.py | 54 ++++++------ 5 files changed, 172 insertions(+), 28 deletions(-) create mode 100644 vllm/lora/ops/libentry.py diff --git a/vllm/lora/ops/libentry.py b/vllm/lora/ops/libentry.py new file mode 100644 index 000000000000..d3fcc1d8e05b --- /dev/null +++ b/vllm/lora/ops/libentry.py @@ -0,0 +1,137 @@ +# Modified from: https://github.com/FlagOpen/FlagGems +import inspect + +import triton + + +class LibEntry(triton.KernelInterface): + + def __init__( + self, + fn, + ): + self.fn = fn + self.arg_names = fn.arg_names + self.divisibility = 16 + self.kernel_cache = dict() + fn = self.fn + while not isinstance(fn, triton.runtime.JITFunction): + fn = fn.fn + self.jit_function: triton.runtime.JITFunction = fn + self.specialize_indices = [ + p.num for p in self.jit_function.params + if not p.is_constexpr and not p.do_not_specialize + ] + self.do_not_specialize_indices = [ + p.num for p in self.jit_function.params + if not p.is_constexpr and p.do_not_specialize + ] + + def key(self, spec_args, dns_args, const_args): + spec_key = [(arg.dtype, arg.data_ptr() % + self.divisibility == 0) if hasattr(arg, "data_ptr") else + (type(arg), arg) for arg in spec_args] + dns_key = [ + arg.dtype if hasattr( + arg, "data_ptr") else type(arg) if not isinstance(arg, int) + else "i32" if -(2**31) <= arg and arg <= 2**31 - + 1 else "u64" if 2**63 <= arg and arg <= 2**64 - 1 else "i64" + for arg in dns_args + ] + # const args passed by position + return tuple(spec_key + dns_key + const_args) + + def run(self, *args, **kwargs): + grid = kwargs["grid"] + + # collect all the arguments + spec_args = [] # specialize arguments + dns_args = [] # do not specialize arguments + const_args = [] # constexpr arguments + k_args = [] # kernel arguments + for i, arg in enumerate(args): + if i in self.specialize_indices: + k_args.append(arg) + spec_args.append(arg) + elif i in self.do_not_specialize_indices: + k_args.append(arg) + dns_args.append(arg) + else: + const_args.append(arg) + for p in self.jit_function.params[len(args):]: + if p.name in kwargs: + val = kwargs[p.name] + elif p.default is inspect._empty: + continue + else: + val = p.default + + if p.is_constexpr: + const_args.append(val) + elif p.do_not_specialize: + dns_args.append(val) + k_args.append(val) + else: 
+ spec_args.append(val) + k_args.append(val) + + entry_key = self.key(spec_args, dns_args, const_args) + + if entry_key not in self.kernel_cache: + kernel = self.fn.run(*args, **kwargs) + fn = self.fn + # collect constexpr arguments for grid computation + constexprs = {} + while not isinstance(fn, triton.runtime.JITFunction): + if isinstance(fn, triton.runtime.Autotuner): + config = fn.best_config + constexprs["num_warps"] = config.num_warps + constexprs["num_stages"] = config.num_stages + constexprs["num_ctas"] = config.num_ctas + constexprs = {**constexprs, **config.kwargs} + elif isinstance(fn, triton.runtime.Heuristics): + for v, heur in fn.values.items(): + constexprs[v] = heur({ + **dict(zip(fn.arg_names, args)), + **kwargs, + **constexprs, + }) + else: + raise RuntimeError("Invalid Runtime Function") + fn = fn.fn + for p in self.jit_function.params: + if p.is_constexpr and p.name not in constexprs: + constexprs[p.name] = p.default + self.kernel_cache[entry_key] = (kernel, constexprs) + else: + kernel, constexprs = self.kernel_cache[entry_key] + + if callable(grid): + # collect all arguments to the grid fn,ie: + # 1. args, + # 2. kwargs, + # 3. all all other captured arguments in CompiledKernel from + # Autotunner & Heuristics when kwargs & captured args conflict, + # captured args have higher priority + meta = {**dict(zip(self.arg_names, args)), **kwargs, **constexprs} + grid = grid(meta) + + grid = grid + (1, 1) + + kernel[grid[0:3]](*k_args) + return + + +def libentry(): + """ + Decorator for triton library entries. + Motivation: + The runtime overhead of Triton kernels is the reason for the lower + performance of small kernels, particularly evident with smaller models. + Using this decorator can reduce Triton runtime overhead. + """ + + def decorator(fn): + return LibEntry(fn) + + return decorator diff --git a/vllm/lora/ops/sgmv_expand.py b/vllm/lora/ops/sgmv_expand.py index 2873882bc263..27e91f5d1e4e 100644 --- a/vllm/lora/ops/sgmv_expand.py +++ b/vllm/lora/ops/sgmv_expand.py @@ -9,7 +9,10 @@ import triton import triton.language as tl +from .libentry import libentry + +@libentry() @triton.jit def _sgmv_expand_kernel( input_ptr, diff --git a/vllm/lora/ops/sgmv_expand_slice.py b/vllm/lora/ops/sgmv_expand_slice.py index 2078a47d7e8e..2906500e7873 100644 --- a/vllm/lora/ops/sgmv_expand_slice.py +++ b/vllm/lora/ops/sgmv_expand_slice.py @@ -9,7 +9,10 @@ import triton import triton.language as tl +from .libentry import libentry + +@libentry() @triton.jit def _sgmv_expand_slice_kernel( input_ptr, diff --git a/vllm/lora/ops/sgmv_shrink.py b/vllm/lora/ops/sgmv_shrink.py index 094bc62d9da4..c5bc1c08364c 100644 --- a/vllm/lora/ops/sgmv_shrink.py +++ b/vllm/lora/ops/sgmv_shrink.py @@ -9,7 +9,10 @@ import triton import triton.language as tl +from .libentry import libentry + +@libentry() @triton.jit def _sgmv_shrink_kernel( input_ptr, diff --git a/vllm/lora/punica.py b/vllm/lora/punica.py index da51105fd907..16d41cfa11ff 100644 --- a/vllm/lora/punica.py +++ b/vllm/lora/punica.py @@ -3,8 +3,8 @@ Chen, L., Ye, Z., Wu, Y., Zhuo, D., Ceze, L., & Krishnamurthy, A. (2023). Punica: Multi-Tenant LoRA Serving. 
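# The libentry() decorator added above removes per-call Triton dispatch cost by
# memoizing the compiled kernel. Below is a minimal, self-contained sketch of
# that caching idea only; `CachedLauncher`, `compile_fn` and `launch_fn` are
# hypothetical placeholders, not the vLLM implementation.
import torch


class CachedLauncher:

    def __init__(self, compile_fn, launch_fn, divisibility=16):
        self._compile = compile_fn   # slow path: JIT/specialize once per key
        self._launch = launch_fn     # fast path: run an already-built kernel
        self._divisibility = divisibility
        self._cache = {}

    def _key(self, args):
        # Tensors contribute (dtype, pointer alignment); plain scalars
        # contribute (type, value), roughly mirroring Triton specialization.
        parts = []
        for arg in args:
            if isinstance(arg, torch.Tensor):
                aligned = arg.data_ptr() % self._divisibility == 0
                parts.append((arg.dtype, aligned))
            else:
                parts.append((type(arg), arg))
        return tuple(parts)

    def __call__(self, grid, *args):
        key = self._key(args)
        kernel = self._cache.get(key)
        if kernel is None:
            kernel = self._compile(grid, *args)   # pay the dispatch cost once
            self._cache[key] = kernel
        return self._launch(kernel, grid, *args)  # cheap repeated launches

# After the first call for a given combination of dtypes/alignment, only the
# dictionary lookup and the raw launch remain, which is where the reduction in
# Triton runtime overhead for small kernels comes from.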
https://arxiv.org/abs/2310.18547 -# """ -# from dataclasses import dataclass, field +""" + from typing import TYPE_CHECKING, Callable, List, Optional, Tuple, Union import torch @@ -22,8 +22,7 @@ from vllm.lora.models import LongContextLoRAContext -@torch.compile -def _compute_meta( +def compute_meta( token_lora_tensor: torch.Tensor ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, int, int]: """ @@ -172,7 +171,8 @@ def convert_mapping( class PunicaWrapper: - """PunicaWrapper is designed to manage and provide metadata for the punica + """ + PunicaWrapper is designed to manage and provide metadata for the punica kernel. The main function is to maintain the state information for Multi-LoRA, and to provide the interface for the punica operator. """ @@ -201,15 +201,15 @@ def __init__(self, max_num_batched_tokens: int, max_batches: int, # embeddings_indices,long_lora_indices self.indices_len: List[Optional[int]] = [None] * 5 # these attributes are the information required for sgmv kernel - self.b_seq_start_tensor = torch.zeros(max_batches, - dtype=torch.long, - device=device) - self.seq_length_tensor = torch.empty(max_batches, - dtype=torch.long, - device=device) - self.lora_indices_tensor = torch.empty(max_batches, - dtype=torch.long, - device=device) + self._seq_start_locs = torch.empty(max_batches, + dtype=torch.long, + device=device) + self._seq_lengths = torch.empty(max_batches, + dtype=torch.long, + device=device) + self._lora_indices_per_batch = torch.empty(max_batches, + dtype=torch.long, + device=device) self.max_length: int = 0 self.batch_size: int = -1 self.is_prefill = False @@ -276,13 +276,12 @@ def _update_base_metadata( def _update_prefill_metada(self, token_lora_tensor: torch.Tensor) -> None: (b_seq_start_tensor, seq_length_tensor, lora_indices_tensor, - batch_size, max_length) = _compute_meta(token_lora_tensor) + batch_size, max_length) = compute_meta(token_lora_tensor) - self.b_seq_start_tensor[:b_seq_start_tensor.shape[0]].copy_( + self._seq_start_locs[:b_seq_start_tensor.shape[0]].copy_( b_seq_start_tensor) - self.seq_length_tensor[:seq_length_tensor.shape[0]].copy_( - seq_length_tensor) - self.lora_indices_tensor[:lora_indices_tensor.shape[0]].copy_( + self._seq_lengths[:seq_length_tensor.shape[0]].copy_(seq_length_tensor) + self._lora_indices_per_batch[:lora_indices_tensor.shape[0]].copy_( lora_indices_tensor) self.batch_size = batch_size self.max_length = max_length @@ -292,18 +291,17 @@ def prefill_metadata( self) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, int, int]: """ This property provides a convenient way to access the necessary - metadata for prefill-related kernel computations. It returns a tuple - containing: - 1. b_seq_start_tensor: Tensor of sequence start positions - 2. seq_length_tensor: Tensor of sequence lengths - 3. lora_indices_tensor: Tensor of lora indices + metadata for prefill-related kernel computations. + 1. seq_start_locs: Tensor of sequence start positions + 2. seq_lengths: Tensor of sequence lengths + 3. lora_indices_per_batch: Tensor of lora indices 4. batch_size: batch size after clustering identical lora indices 5. 
max_length: The maximum sequence length in the batch """ - return (self.b_seq_start_tensor[:self.batch_size], - self.seq_length_tensor[:self.batch_size], - self.lora_indices_tensor[:self.batch_size], self.batch_size, - self.max_length) + return (self._seq_start_locs[:self.batch_size], + self._seq_lengths[:self.batch_size], + self._lora_indices_per_batch[:self.batch_size], + self.batch_size, self.max_length) @property def token_lora_indices(self) -> torch.Tensor: From 1dc8ec0e545ae4caf7f6462724d6ce23703754eb Mon Sep 17 00:00:00 2001 From: jeejeeli Date: Fri, 12 Jul 2024 14:31:51 +0800 Subject: [PATCH 51/71] delete libentry --- vllm/lora/ops/libentry.py | 137 ----------------------------- vllm/lora/ops/sgmv_expand.py | 3 - vllm/lora/ops/sgmv_expand_slice.py | 3 - vllm/lora/ops/sgmv_shrink.py | 3 - 4 files changed, 146 deletions(-) delete mode 100644 vllm/lora/ops/libentry.py diff --git a/vllm/lora/ops/libentry.py b/vllm/lora/ops/libentry.py deleted file mode 100644 index d3fcc1d8e05b..000000000000 --- a/vllm/lora/ops/libentry.py +++ /dev/null @@ -1,137 +0,0 @@ -# Modified from: https://github.com/FlagOpen/FlagGems -import inspect - -import triton - - -class LibEntry(triton.KernelInterface): - - def __init__( - self, - fn, - ): - self.fn = fn - self.arg_names = fn.arg_names - self.divisibility = 16 - self.kernel_cache = dict() - fn = self.fn - while not isinstance(fn, triton.runtime.JITFunction): - fn = fn.fn - self.jit_function: triton.runtime.JITFunction = fn - self.specialize_indices = [ - p.num for p in self.jit_function.params - if not p.is_constexpr and not p.do_not_specialize - ] - self.do_not_specialize_indices = [ - p.num for p in self.jit_function.params - if not p.is_constexpr and p.do_not_specialize - ] - - def key(self, spec_args, dns_args, const_args): - spec_key = [(arg.dtype, arg.data_ptr() % - self.divisibility == 0) if hasattr(arg, "data_ptr") else - (type(arg), arg) for arg in spec_args] - dns_key = [ - arg.dtype if hasattr( - arg, "data_ptr") else type(arg) if not isinstance(arg, int) - else "i32" if -(2**31) <= arg and arg <= 2**31 - - 1 else "u64" if 2**63 <= arg and arg <= 2**64 - 1 else "i64" - for arg in dns_args - ] - # const args passed by position - return tuple(spec_key + dns_key + const_args) - - def run(self, *args, **kwargs): - grid = kwargs["grid"] - - # collect all the arguments - spec_args = [] # specialize arguments - dns_args = [] # do not specialize arguments - const_args = [] # constexpr arguments - k_args = [] # kernel arguments - for i, arg in enumerate(args): - if i in self.specialize_indices: - k_args.append(arg) - spec_args.append(arg) - elif i in self.do_not_specialize_indices: - k_args.append(arg) - dns_args.append(arg) - else: - const_args.append(arg) - for p in self.jit_function.params[len(args):]: - if p.name in kwargs: - val = kwargs[p.name] - elif p.default is inspect._empty: - continue - else: - val = p.default - - if p.is_constexpr: - const_args.append(val) - elif p.do_not_specialize: - dns_args.append(val) - k_args.append(val) - else: - spec_args.append(val) - k_args.append(val) - - entry_key = self.key(spec_args, dns_args, const_args) - - if entry_key not in self.kernel_cache: - kernel = self.fn.run(*args, **kwargs) - fn = self.fn - # collect constexpr arguments for grid computation - constexprs = {} - while not isinstance(fn, triton.runtime.JITFunction): - if isinstance(fn, triton.runtime.Autotuner): - config = fn.best_config - constexprs["num_warps"] = config.num_warps - constexprs["num_stages"] = config.num_stages - 
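# The prefill metadata described above (sequence start offsets, per-sequence
# lengths, one LoRA index per clustered batch, batch size and max length) can
# be derived from a flat per-token LoRA-id tensor. The helper below is a
# simplified illustration of what compute_meta provides, assuming tokens of
# the same request are stored contiguously; it is a sketch, not the vLLM code.
import torch


def compute_prefill_meta(token_lora_tensor: torch.Tensor):
    # Collapse consecutive runs of the same LoRA id into one logical batch.
    lora_indices_per_batch, seq_lengths = torch.unique_consecutive(
        token_lora_tensor, return_counts=True)
    # Start offset of each run inside the flat token dimension.
    cumulative = torch.cumsum(seq_lengths, dim=0)
    seq_start_locs = torch.zeros_like(cumulative)
    seq_start_locs[1:] = cumulative[:-1]
    batch_size = lora_indices_per_batch.size(0)
    max_length = int(seq_lengths.max())
    return (seq_start_locs, seq_lengths, lora_indices_per_batch,
            batch_size, max_length)

# Example: token_lora_tensor = torch.tensor([0, 0, 0, 0, 1, 1]) yields
# start locs [0, 4], lengths [4, 2], LoRA indices [0, 1], batch_size 2 and
# max_length 4.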
constexprs["num_ctas"] = config.num_ctas - constexprs = {**constexprs, **config.kwargs} - elif isinstance(fn, triton.runtime.Heuristics): - for v, heur in fn.values.items(): - constexprs[v] = heur({ - **dict(zip(fn.arg_names, args)), - **kwargs, - **constexprs, - }) - else: - raise RuntimeError("Invalid Runtime Function") - fn = fn.fn - for p in self.jit_function.params: - if p.is_constexpr and p.name not in constexprs: - constexprs[p.name] = p.default - self.kernel_cache[entry_key] = (kernel, constexprs) - else: - kernel, constexprs = self.kernel_cache[entry_key] - - if callable(grid): - # collect all arguments to the grid fn,ie: - # 1. args, - # 2. kwargs, - # 3. all all other captured arguments in CompiledKernel from - # Autotunner & Heuristics when kwargs & captured args conflict, - # captured args have higher priority - meta = {**dict(zip(self.arg_names, args)), **kwargs, **constexprs} - grid = grid(meta) - - grid = grid + (1, 1) - - kernel[grid[0:3]](*k_args) - return - - -def libentry(): - """ - Decorator for triton library entries. - Motivation: - The runtime overhead of Triton kernels is the reason for the lower - performance of small kernels, particularly evident with smaller models. - Using this decorator can reduce Triton runtime overhead. - """ - - def decorator(fn): - return LibEntry(fn) - - return decorator diff --git a/vllm/lora/ops/sgmv_expand.py b/vllm/lora/ops/sgmv_expand.py index 27e91f5d1e4e..2873882bc263 100644 --- a/vllm/lora/ops/sgmv_expand.py +++ b/vllm/lora/ops/sgmv_expand.py @@ -9,10 +9,7 @@ import triton import triton.language as tl -from .libentry import libentry - -@libentry() @triton.jit def _sgmv_expand_kernel( input_ptr, diff --git a/vllm/lora/ops/sgmv_expand_slice.py b/vllm/lora/ops/sgmv_expand_slice.py index 2906500e7873..2078a47d7e8e 100644 --- a/vllm/lora/ops/sgmv_expand_slice.py +++ b/vllm/lora/ops/sgmv_expand_slice.py @@ -9,10 +9,7 @@ import triton import triton.language as tl -from .libentry import libentry - -@libentry() @triton.jit def _sgmv_expand_slice_kernel( input_ptr, diff --git a/vllm/lora/ops/sgmv_shrink.py b/vllm/lora/ops/sgmv_shrink.py index c5bc1c08364c..094bc62d9da4 100644 --- a/vllm/lora/ops/sgmv_shrink.py +++ b/vllm/lora/ops/sgmv_shrink.py @@ -9,10 +9,7 @@ import triton import triton.language as tl -from .libentry import libentry - -@libentry() @triton.jit def _sgmv_shrink_kernel( input_ptr, From 9585adba77f7012247cf587f16fa6ab224d3f1ea Mon Sep 17 00:00:00 2001 From: jeejeeli Date: Mon, 15 Jul 2024 18:20:05 +0800 Subject: [PATCH 52/71] delete punica_c code --- .github/workflows/scripts/build.sh | 2 - CMakeLists.txt | 62 - Dockerfile | 2 - Dockerfile.rocm | 3 +- csrc/punica/LICENSE | 217 --- csrc/punica/bgmv/bgmv_bf16_bf16_bf16.cu | 5 - csrc/punica/bgmv/bgmv_bf16_fp32_bf16.cu | 5 - csrc/punica/bgmv/bgmv_config.h | 218 --- csrc/punica/bgmv/bgmv_fp16_fp16_fp16.cu | 5 - csrc/punica/bgmv/bgmv_fp16_fp32_fp16.cu | 5 - csrc/punica/bgmv/bgmv_fp32_bf16_bf16.cu | 5 - csrc/punica/bgmv/bgmv_fp32_fp16_fp16.cu | 5 - csrc/punica/bgmv/bgmv_impl.cuh | 451 ------ csrc/punica/bgmv/generator.py | 48 - csrc/punica/bgmv/vec_dtypes.cuh | 1325 ------------------ csrc/punica/punica_ops.cu | 569 -------- csrc/punica/punica_ops.h | 11 - csrc/punica/torch_bindings.cpp | 18 - csrc/punica/type_convert.h | 82 -- docs/source/getting_started/installation.rst | 1 - setup.py | 10 - vllm/envs.py | 5 - 22 files changed, 1 insertion(+), 3053 deletions(-) delete mode 100644 csrc/punica/LICENSE delete mode 100644 csrc/punica/bgmv/bgmv_bf16_bf16_bf16.cu delete mode 100644 
csrc/punica/bgmv/bgmv_bf16_fp32_bf16.cu delete mode 100644 csrc/punica/bgmv/bgmv_config.h delete mode 100644 csrc/punica/bgmv/bgmv_fp16_fp16_fp16.cu delete mode 100644 csrc/punica/bgmv/bgmv_fp16_fp32_fp16.cu delete mode 100644 csrc/punica/bgmv/bgmv_fp32_bf16_bf16.cu delete mode 100644 csrc/punica/bgmv/bgmv_fp32_fp16_fp16.cu delete mode 100644 csrc/punica/bgmv/bgmv_impl.cuh delete mode 100644 csrc/punica/bgmv/generator.py delete mode 100644 csrc/punica/bgmv/vec_dtypes.cuh delete mode 100644 csrc/punica/punica_ops.cu delete mode 100644 csrc/punica/punica_ops.h delete mode 100644 csrc/punica/torch_bindings.cpp delete mode 100644 csrc/punica/type_convert.h diff --git a/.github/workflows/scripts/build.sh b/.github/workflows/scripts/build.sh index 60a3978f9abd..0a759d303238 100644 --- a/.github/workflows/scripts/build.sh +++ b/.github/workflows/scripts/build.sh @@ -13,8 +13,6 @@ $python_executable -m pip install -r requirements-cuda.txt # Limit the number of parallel jobs to avoid OOM export MAX_JOBS=1 -# Make sure punica is built for the release (for LoRA) -export VLLM_INSTALL_PUNICA_KERNELS=1 # Make sure release wheels are built for the following architectures export TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 8.9 9.0+PTX" # Build diff --git a/CMakeLists.txt b/CMakeLists.txt index ced73ca03bfb..df504a022cdf 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -220,61 +220,7 @@ define_gpu_extension_target( USE_SABI 3 WITH_SOABI) -# -# _punica_C extension -# - -set(VLLM_PUNICA_EXT_SRC - "csrc/punica/bgmv/bgmv_bf16_bf16_bf16.cu" - "csrc/punica/bgmv/bgmv_bf16_fp32_bf16.cu" - "csrc/punica/bgmv/bgmv_fp16_fp16_fp16.cu" - "csrc/punica/bgmv/bgmv_fp16_fp32_fp16.cu" - "csrc/punica/bgmv/bgmv_fp32_bf16_bf16.cu" - "csrc/punica/bgmv/bgmv_fp32_fp16_fp16.cu" - "csrc/punica/punica_ops.cu" - "csrc/punica/torch_bindings.cpp") - -# -# Copy GPU compilation flags+update for punica -# -set(VLLM_PUNICA_GPU_FLAGS ${VLLM_GPU_FLAGS}) -list(REMOVE_ITEM VLLM_PUNICA_GPU_FLAGS - "-D__CUDA_NO_HALF_OPERATORS__" - "-D__CUDA_NO_HALF_CONVERSIONS__" - "-D__CUDA_NO_BFLOAT16_CONVERSIONS__" - "-D__CUDA_NO_HALF2_OPERATORS__") - -# -# Filter out CUDA architectures < 8.0 for punica. -# -if (${VLLM_GPU_LANG} STREQUAL "CUDA") - set(VLLM_PUNICA_GPU_ARCHES) - foreach(ARCH ${VLLM_GPU_ARCHES}) - string_to_ver(CODE_VER ${ARCH}) - if (CODE_VER GREATER_EQUAL 8.0) - list(APPEND VLLM_PUNICA_GPU_ARCHES ${ARCH}) - endif() - endforeach() - message(STATUS "Punica target arches: ${VLLM_PUNICA_GPU_ARCHES}") -elseif(${VLLM_GPU_LANG} STREQUAL "HIP") - set(VLLM_PUNICA_GPU_ARCHES ${VLLM_GPU_ARCHES}) - message(STATUS "Punica target arches: ${VLLM_PUNICA_GPU_ARCHES}") -endif() -if (VLLM_PUNICA_GPU_ARCHES) - define_gpu_extension_target( - _punica_C - DESTINATION vllm - LANGUAGE ${VLLM_GPU_LANG} - SOURCES ${VLLM_PUNICA_EXT_SRC} - COMPILE_FLAGS ${VLLM_PUNICA_GPU_FLAGS} - ARCHITECTURES ${VLLM_PUNICA_GPU_ARCHES} - USE_SABI 3 - WITH_SOABI) -else() - message(WARNING "Unable to create _punica_C target because none of the " - "requested architectures (${VLLM_GPU_ARCHES}) are supported, i.e. >= 8.0") -endif() # # Add the `default` target which detects which extensions should be @@ -298,12 +244,4 @@ if(VLLM_GPU_LANG STREQUAL "CUDA" OR VLLM_GPU_LANG STREQUAL "HIP") message(STATUS "Enabling moe extension.") add_dependencies(default _moe_C) - # Enable punica if -DVLLM_INSTALL_PUNICA_KERNELS=ON or - # VLLM_INSTALL_PUNICA_KERNELS is set in the environment and - # there are supported target arches. 
- if (VLLM_PUNICA_GPU_ARCHES AND - (ENV{VLLM_INSTALL_PUNICA_KERNELS} OR VLLM_INSTALL_PUNICA_KERNELS)) - message(STATUS "Enabling punica extension.") - add_dependencies(default _punica_C) - endif() endif() diff --git a/Dockerfile b/Dockerfile index 7fbc168ace3d..590b0554cae9 100644 --- a/Dockerfile +++ b/Dockerfile @@ -85,8 +85,6 @@ ENV MAX_JOBS=${max_jobs} # number of threads used by nvcc ARG nvcc_threads=8 ENV NVCC_THREADS=$nvcc_threads -# make sure punica kernels are built (for LoRA) -ENV VLLM_INSTALL_PUNICA_KERNELS=1 ARG buildkite_commit ENV BUILDKITE_COMMIT=${buildkite_commit} diff --git a/Dockerfile.rocm b/Dockerfile.rocm index befb0499f2e6..7e29a73010ab 100644 --- a/Dockerfile.rocm +++ b/Dockerfile.rocm @@ -150,8 +150,7 @@ RUN case "$(which python3)" in \ RUN --mount=type=cache,target=/root/.cache/pip \ pip install --upgrade numba scipy huggingface-hub[cli] -# Make sure punica kernels are built (for LoRA) -ENV VLLM_INSTALL_PUNICA_KERNELS=1 + # Workaround for ray >= 2.10.0 ENV RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1 # Silences the HF Tokenizers warning diff --git a/csrc/punica/LICENSE b/csrc/punica/LICENSE deleted file mode 100644 index a46e2cdcadf7..000000000000 --- a/csrc/punica/LICENSE +++ /dev/null @@ -1,217 +0,0 @@ -Contains code from https://github.com/punica-ai/punica - - Apache License - Version 2.0, January 2004 - http://www.apache.org/licenses/ - - TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION - - 1. Definitions. - - "License" shall mean the terms and conditions for use, reproduction, - and distribution as defined by Sections 1 through 9 of this document. - - "Licensor" shall mean the copyright owner or entity authorized by - the copyright owner that is granting the License. - - "Legal Entity" shall mean the union of the acting entity and all - other entities that control, are controlled by, or are under common - control with that entity. For the purposes of this definition, - "control" means (i) the power, direct or indirect, to cause the - direction or management of such entity, whether by contract or - otherwise, or (ii) ownership of fifty percent (50%) or more of the - outstanding shares, or (iii) beneficial ownership of such entity. - - "You" (or "Your") shall mean an individual or Legal Entity - exercising permissions granted by this License. - - "Source" form shall mean the preferred form for making modifications, - including but not limited to software source code, documentation - source, and configuration files. - - "Object" form shall mean any form resulting from mechanical - transformation or translation of a Source form, including but - not limited to compiled object code, generated documentation, - and conversions to other media types. - - "Work" shall mean the work of authorship, whether in Source or - Object form, made available under the License, as indicated by a - copyright notice that is included in or attached to the work - (an example is provided in the Appendix below). - - "Derivative Works" shall mean any work, whether in Source or Object - form, that is based on (or derived from) the Work and for which the - editorial revisions, annotations, elaborations, or other modifications - represent, as a whole, an original work of authorship. For the purposes - of this License, Derivative Works shall not include works that remain - separable from, or merely link (or bind by name) to the interfaces of, - the Work and Derivative Works thereof. 
- - "Contribution" shall mean any work of authorship, including - the original version of the Work and any modifications or additions - to that Work or Derivative Works thereof, that is intentionally - submitted to Licensor for inclusion in the Work by the copyright owner - or by an individual or Legal Entity authorized to submit on behalf of - the copyright owner. For the purposes of this definition, "submitted" - means any form of electronic, verbal, or written communication sent - to the Licensor or its representatives, including but not limited to - communication on electronic mailing lists, source code control systems, - and issue tracking systems that are managed by, or on behalf of, the - Licensor for the purpose of discussing and improving the Work, but - excluding communication that is conspicuously marked or otherwise - designated in writing by the copyright owner as "Not a Contribution." - - "Contributor" shall mean Licensor and any individual or Legal Entity - on behalf of whom a Contribution has been received by Licensor and - subsequently incorporated within the Work. - - 2. Grant of Copyright License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - copyright license to reproduce, prepare Derivative Works of, - publicly display, publicly perform, sublicense, and distribute the - Work and such Derivative Works in Source or Object form. - - 3. Grant of Patent License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - (except as stated in this section) patent license to make, have made, - use, offer to sell, sell, import, and otherwise transfer the Work, - where such license applies only to those patent claims licensable - by such Contributor that are necessarily infringed by their - Contribution(s) alone or by combination of their Contribution(s) - with the Work to which such Contribution(s) was submitted. If You - institute patent litigation against any entity (including a - cross-claim or counterclaim in a lawsuit) alleging that the Work - or a Contribution incorporated within the Work constitutes direct - or contributory patent infringement, then any patent licenses - granted to You under this License for that Work shall terminate - as of the date such litigation is filed. - - 4. Redistribution. 
You may reproduce and distribute copies of the - Work or Derivative Works thereof in any medium, with or without - modifications, and in Source or Object form, provided that You - meet the following conditions: - - (a) You must give any other recipients of the Work or - Derivative Works a copy of this License; and - - (b) You must cause any modified files to carry prominent notices - stating that You changed the files; and - - (c) You must retain, in the Source form of any Derivative Works - that You distribute, all copyright, patent, trademark, and - attribution notices from the Source form of the Work, - excluding those notices that do not pertain to any part of - the Derivative Works; and - - (d) If the Work includes a "NOTICE" text file as part of its - distribution, then any Derivative Works that You distribute must - include a readable copy of the attribution notices contained - within such NOTICE file, excluding those notices that do not - pertain to any part of the Derivative Works, in at least one - of the following places: within a NOTICE text file distributed - as part of the Derivative Works; within the Source form or - documentation, if provided along with the Derivative Works; or, - within a display generated by the Derivative Works, if and - wherever such third-party notices normally appear. The contents - of the NOTICE file are for informational purposes only and - do not modify the License. You may add Your own attribution - notices within Derivative Works that You distribute, alongside - or as an addendum to the NOTICE text from the Work, provided - that such additional attribution notices cannot be construed - as modifying the License. - - You may add Your own copyright statement to Your modifications and - may provide additional or different license terms and conditions - for use, reproduction, or distribution of Your modifications, or - for any such Derivative Works as a whole, provided Your use, - reproduction, and distribution of the Work otherwise complies with - the conditions stated in this License. - - 5. Submission of Contributions. Unless You explicitly state otherwise, - any Contribution intentionally submitted for inclusion in the Work - by You to the Licensor shall be under the terms and conditions of - this License, without any additional terms or conditions. - Notwithstanding the above, nothing herein shall supersede or modify - the terms of any separate license agreement you may have executed - with Licensor regarding such Contributions. - - 6. Trademarks. This License does not grant permission to use the trade - names, trademarks, service marks, or product names of the Licensor, - except as required for reasonable and customary use in describing the - origin of the Work and reproducing the content of the NOTICE file. - - 7. Disclaimer of Warranty. Unless required by applicable law or - agreed to in writing, Licensor provides the Work (and each - Contributor provides its Contributions) on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or - implied, including, without limitation, any warranties or conditions - of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A - PARTICULAR PURPOSE. You are solely responsible for determining the - appropriateness of using or redistributing the Work and assume any - risks associated with Your exercise of permissions under this License. - - 8. Limitation of Liability. 
In no event and under no legal theory, - whether in tort (including negligence), contract, or otherwise, - unless required by applicable law (such as deliberate and grossly - negligent acts) or agreed to in writing, shall any Contributor be - liable to You for damages, including any direct, indirect, special, - incidental, or consequential damages of any character arising as a - result of this License or out of the use or inability to use the - Work (including but not limited to damages for loss of goodwill, - work stoppage, computer failure or malfunction, or any and all - other commercial damages or losses), even if such Contributor - has been advised of the possibility of such damages. - - 9. Accepting Warranty or Additional Liability. While redistributing - the Work or Derivative Works thereof, You may choose to offer, - and charge a fee for, acceptance of support, warranty, indemnity, - or other liability obligations and/or rights consistent with this - License. However, in accepting such obligations, You may act only - on Your own behalf and on Your sole responsibility, not on behalf - of any other Contributor, and only if You agree to indemnify, - defend, and hold each Contributor harmless for any liability - incurred by, or claims asserted against, such Contributor by reason - of your accepting any such warranty or additional liability. - - END OF TERMS AND CONDITIONS - - APPENDIX: How to apply the Apache License to your work. - - To apply the Apache License to your work, attach the following - boilerplate notice, with the fields enclosed by brackets "{}" - replaced with your own identifying information. (Don't include - the brackets!) The text should be enclosed in the appropriate - comment syntax for the file format. We also recommend that a - file or class name and description of purpose be included on the - same "printed page" as the copyright notice for easier - identification within third-party archives. - - Copyright {yyyy} {name of copyright owner} - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. - ------------------------------------------------------------------------------------- - -This product bundles various third-party components under other open source licenses. -This section summarizes those components and their licenses. See licenses/ -for text of these licenses. 
- - -Apache-2.0 -* third_party/nvbench (with LLVM exception) -* third_party/flashinfer - -BSD-3-Clause: -* third_party/cutlass \ No newline at end of file diff --git a/csrc/punica/bgmv/bgmv_bf16_bf16_bf16.cu b/csrc/punica/bgmv/bgmv_bf16_bf16_bf16.cu deleted file mode 100644 index 86846c274c90..000000000000 --- a/csrc/punica/bgmv/bgmv_bf16_bf16_bf16.cu +++ /dev/null @@ -1,5 +0,0 @@ -#include "bgmv_config.h" -#include "bgmv_impl.cuh" - -FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, nv_bfloat16, nv_bfloat16, nv_bfloat16) -FOR_INST_BGMV_WIDE_NARROW(INST_BGMV_ONESIDE, nv_bfloat16, nv_bfloat16, nv_bfloat16) diff --git a/csrc/punica/bgmv/bgmv_bf16_fp32_bf16.cu b/csrc/punica/bgmv/bgmv_bf16_fp32_bf16.cu deleted file mode 100644 index de39c3121f5d..000000000000 --- a/csrc/punica/bgmv/bgmv_bf16_fp32_bf16.cu +++ /dev/null @@ -1,5 +0,0 @@ -#include "bgmv_config.h" -#include "bgmv_impl.cuh" - -FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, nv_bfloat16, float, nv_bfloat16) -FOR_INST_BGMV_WIDE_NARROW(INST_BGMV_ONESIDE, nv_bfloat16, float, nv_bfloat16) diff --git a/csrc/punica/bgmv/bgmv_config.h b/csrc/punica/bgmv/bgmv_config.h deleted file mode 100644 index 2c8d007d8719..000000000000 --- a/csrc/punica/bgmv/bgmv_config.h +++ /dev/null @@ -1,218 +0,0 @@ -#pragma once - -template -void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X, - const W_T *__restrict__ W, - const int64_t *__restrict__ indicies, int64_t y_offset, - int64_t full_y_size, int64_t batch_size, int64_t num_layers, - int64_t layer_idx, float scale); - -// clang-format off - -#define FOR_BGMV_WIDE(f, in_T, out_T, W_T, narrow) \ - f(in_T, out_T, W_T, narrow, 128) \ - f(in_T, out_T, W_T, narrow, 256) \ - f(in_T, out_T, W_T, narrow, 512) \ - f(in_T, out_T, W_T, narrow, 640) \ - f(in_T, out_T, W_T, narrow, 768) \ - f(in_T, out_T, W_T, narrow, 896) \ - f(in_T, out_T, W_T, narrow, 1024) \ - f(in_T, out_T, W_T, narrow, 1152) \ - f(in_T, out_T, W_T, narrow, 1216) \ - f(in_T, out_T, W_T, narrow, 1280) \ - f(in_T, out_T, W_T, narrow, 1536) \ - f(in_T, out_T, W_T, narrow, 1664) \ - f(in_T, out_T, W_T, narrow, 1728) \ - f(in_T, out_T, W_T, narrow, 1792) \ - f(in_T, out_T, W_T, narrow, 2048) \ - f(in_T, out_T, W_T, narrow, 2240) \ - f(in_T, out_T, W_T, narrow, 2304) \ - f(in_T, out_T, W_T, narrow, 2368) \ - f(in_T, out_T, W_T, narrow, 2432) \ - f(in_T, out_T, W_T, narrow, 2560) \ - f(in_T, out_T, W_T, narrow, 2752) \ - f(in_T, out_T, W_T, narrow, 2816) \ - f(in_T, out_T, W_T, narrow, 3072) \ - f(in_T, out_T, W_T, narrow, 3328) \ - f(in_T, out_T, W_T, narrow, 3456) \ - f(in_T, out_T, W_T, narrow, 3584) \ - f(in_T, out_T, W_T, narrow, 3712) \ - f(in_T, out_T, W_T, narrow, 4096) \ - f(in_T, out_T, W_T, narrow, 4480) \ - f(in_T, out_T, W_T, narrow, 4608) \ - f(in_T, out_T, W_T, narrow, 4736) \ - f(in_T, out_T, W_T, narrow, 4864) \ - f(in_T, out_T, W_T, narrow, 5120) \ - f(in_T, out_T, W_T, narrow, 5504) \ - f(in_T, out_T, W_T, narrow, 5632) \ - f(in_T, out_T, W_T, narrow, 5888) \ - f(in_T, out_T, W_T, narrow, 6144) \ - f(in_T, out_T, W_T, narrow, 6400) \ - f(in_T, out_T, W_T, narrow, 6848) \ - f(in_T, out_T, W_T, narrow, 6912) \ - f(in_T, out_T, W_T, narrow, 7168) \ - f(in_T, out_T, W_T, narrow, 7424) \ - f(in_T, out_T, W_T, narrow, 8192) \ - f(in_T, out_T, W_T, narrow, 8960) \ - f(in_T, out_T, W_T, narrow, 9216) \ - f(in_T, out_T, W_T, narrow, 9472) \ - f(in_T, out_T, W_T, narrow, 10240) \ - f(in_T, out_T, W_T, narrow, 11008) \ - f(in_T, out_T, W_T, narrow, 11264) \ - f(in_T, out_T, W_T, narrow, 12288) \ - f(in_T, out_T, W_T, narrow, 13696) \ - f(in_T, 
out_T, W_T, narrow, 13824) \ - f(in_T, out_T, W_T, narrow, 14336) \ - f(in_T, out_T, W_T, narrow, 14784) \ - f(in_T, out_T, W_T, narrow, 14848) \ - f(in_T, out_T, W_T, narrow, 15360) \ - f(in_T, out_T, W_T, narrow, 16384) \ - f(in_T, out_T, W_T, narrow, 18944) \ - f(in_T, out_T, W_T, narrow, 20480) \ - f(in_T, out_T, W_T, narrow, 22016) \ - f(in_T, out_T, W_T, narrow, 22528) \ - f(in_T, out_T, W_T, narrow, 24576) \ - f(in_T, out_T, W_T, narrow, 27392) \ - f(in_T, out_T, W_T, narrow, 27648) \ - f(in_T, out_T, W_T, narrow, 28672) \ - f(in_T, out_T, W_T, narrow, 29568) \ - f(in_T, out_T, W_T, narrow, 29696) \ - f(in_T, out_T, W_T, narrow, 32000) \ - f(in_T, out_T, W_T, narrow, 32256) \ - f(in_T, out_T, W_T, narrow, 32512) \ - f(in_T, out_T, W_T, narrow, 32768) \ - f(in_T, out_T, W_T, narrow, 33024) \ - f(in_T, out_T, W_T, narrow, 36864) \ - f(in_T, out_T, W_T, narrow, 43264) \ - f(in_T, out_T, W_T, narrow, 49152) \ - f(in_T, out_T, W_T, narrow, 49408) \ - f(in_T, out_T, W_T, narrow, 60544) \ - f(in_T, out_T, W_T, narrow, 60672) \ - f(in_T, out_T, W_T, narrow, 64000) \ - f(in_T, out_T, W_T, narrow, 64256) \ - f(in_T, out_T, W_T, narrow, 64512) \ - f(in_T, out_T, W_T, narrow, 102400) \ - f(in_T, out_T, W_T, narrow, 102656) \ - f(in_T, out_T, W_T, narrow, 102912) \ - f(in_T, out_T, W_T, narrow, 128000) \ - f(in_T, out_T, W_T, narrow, 128256) \ - f(in_T, out_T, W_T, narrow, 128512) \ - - -// Keep above in sync with vllm/lora/layers::LogitsProcessorWithLoRA -// and vllm/tests/lora/test_punica.py - -// Used for defining kernels going from the variety of -// dim in to the narrow dim out - // Using it for the fully sharded column - // parallel LoRA A which splits the rank dim -#define FOR_INST_BGMV_NARROW(f, in_T, out_T, W_T, narrow) \ - f(in_T, out_T, W_T, 128, narrow) \ - f(in_T, out_T, W_T, 256, narrow) \ - f(in_T, out_T, W_T, 512, narrow) \ - f(in_T, out_T, W_T, 640, narrow) \ - f(in_T, out_T, W_T, 768, narrow) \ - f(in_T, out_T, W_T, 896, narrow) \ - f(in_T, out_T, W_T, 1024, narrow) \ - f(in_T, out_T, W_T, 1152, narrow) \ - f(in_T, out_T, W_T, 1216, narrow) \ - f(in_T, out_T, W_T, 1280, narrow) \ - f(in_T, out_T, W_T, 1536, narrow) \ - f(in_T, out_T, W_T, 1664, narrow) \ - f(in_T, out_T, W_T, 1728, narrow) \ - f(in_T, out_T, W_T, 1792, narrow) \ - f(in_T, out_T, W_T, 2048, narrow) \ - f(in_T, out_T, W_T, 2240, narrow) \ - f(in_T, out_T, W_T, 2304, narrow) \ - f(in_T, out_T, W_T, 2368, narrow) \ - f(in_T, out_T, W_T, 2432, narrow) \ - f(in_T, out_T, W_T, 2560, narrow) \ - f(in_T, out_T, W_T, 2752, narrow) \ - f(in_T, out_T, W_T, 2816, narrow) \ - f(in_T, out_T, W_T, 3072, narrow) \ - f(in_T, out_T, W_T, 3328, narrow) \ - f(in_T, out_T, W_T, 3456, narrow) \ - f(in_T, out_T, W_T, 3584, narrow) \ - f(in_T, out_T, W_T, 3712, narrow) \ - f(in_T, out_T, W_T, 4096, narrow) \ - f(in_T, out_T, W_T, 4480, narrow) \ - f(in_T, out_T, W_T, 4608, narrow) \ - f(in_T, out_T, W_T, 4736, narrow) \ - f(in_T, out_T, W_T, 4864, narrow) \ - f(in_T, out_T, W_T, 5120, narrow) \ - f(in_T, out_T, W_T, 5504, narrow) \ - f(in_T, out_T, W_T, 5632, narrow) \ - f(in_T, out_T, W_T, 5888, narrow) \ - f(in_T, out_T, W_T, 6144, narrow) \ - f(in_T, out_T, W_T, 6400, narrow) \ - f(in_T, out_T, W_T, 6848, narrow) \ - f(in_T, out_T, W_T, 6912, narrow) \ - f(in_T, out_T, W_T, 7168, narrow) \ - f(in_T, out_T, W_T, 7424, narrow) \ - f(in_T, out_T, W_T, 8192, narrow) \ - f(in_T, out_T, W_T, 8960, narrow) \ - f(in_T, out_T, W_T, 9216, narrow) \ - f(in_T, out_T, W_T, 9472, narrow) \ - f(in_T, out_T, W_T, 10240, narrow) \ - f(in_T, out_T, 
W_T, 11008, narrow) \ - f(in_T, out_T, W_T, 11264, narrow) \ - f(in_T, out_T, W_T, 12288, narrow) \ - f(in_T, out_T, W_T, 13696, narrow) \ - f(in_T, out_T, W_T, 13824, narrow) \ - f(in_T, out_T, W_T, 14336, narrow) \ - f(in_T, out_T, W_T, 14784, narrow) \ - f(in_T, out_T, W_T, 14848, narrow) \ - f(in_T, out_T, W_T, 15360, narrow) \ - f(in_T, out_T, W_T, 16384, narrow) \ - f(in_T, out_T, W_T, 18944, narrow) \ - f(in_T, out_T, W_T, 20480, narrow) \ - f(in_T, out_T, W_T, 22016, narrow) \ - f(in_T, out_T, W_T, 22528, narrow) \ - f(in_T, out_T, W_T, 24576, narrow) \ - f(in_T, out_T, W_T, 27392, narrow) \ - f(in_T, out_T, W_T, 27648, narrow) \ - f(in_T, out_T, W_T, 28672, narrow) \ - f(in_T, out_T, W_T, 29568, narrow) \ - f(in_T, out_T, W_T, 29696, narrow) \ - f(in_T, out_T, W_T, 32000, narrow) \ - f(in_T, out_T, W_T, 32256, narrow) \ - f(in_T, out_T, W_T, 32512, narrow) \ - f(in_T, out_T, W_T, 32768, narrow) \ - f(in_T, out_T, W_T, 33024, narrow) \ - f(in_T, out_T, W_T, 36864, narrow) \ - f(in_T, out_T, W_T, 43264, narrow) \ - f(in_T, out_T, W_T, 49152, narrow) \ - f(in_T, out_T, W_T, 49408, narrow) \ - f(in_T, out_T, W_T, 60544, narrow) \ - f(in_T, out_T, W_T, 60672, narrow) \ - f(in_T, out_T, W_T, 64000, narrow) \ - f(in_T, out_T, W_T, 64256, narrow) \ - f(in_T, out_T, W_T, 64512, narrow) \ - f(in_T, out_T, W_T, 102400, narrow) \ - f(in_T, out_T, W_T, 102656, narrow) \ - f(in_T, out_T, W_T, 102912, narrow) \ - f(in_T, out_T, W_T, 128000, narrow) \ - f(in_T, out_T, W_T, 128256, narrow) \ - f(in_T, out_T, W_T, 128512, narrow) \ -// Keep above in sync with vllm/lora/layers::SamplerWithLoRA - - -// Keep this in sync with vllm/config::LoRAConfig -#define FOR_BGMV_WIDE_NARROW(f, in_T, out_T, W_T) \ - FOR_BGMV_WIDE(f, in_T, out_T, W_T, 8) \ - FOR_BGMV_WIDE(f, in_T, out_T, W_T, 16) \ - FOR_BGMV_WIDE(f, in_T, out_T, W_T, 32) \ - FOR_BGMV_WIDE(f, in_T, out_T, W_T, 64) - - -#define FOR_INST_BGMV_WIDE_NARROW(f, in_T, out_T, W_T) \ - FOR_INST_BGMV_NARROW(f, in_T, out_T, W_T, 1) \ - FOR_INST_BGMV_NARROW(f, in_T, out_T, W_T, 2) \ - FOR_INST_BGMV_NARROW(f, in_T, out_T, W_T, 4) \ - f(in_T, out_T, W_T, 8, 64) \ - f(in_T, out_T, W_T, 16, 64) \ - f(in_T, out_T, W_T, 32, 64) \ - f(in_T, out_T, W_T, 64, 64) - -// clang-format on diff --git a/csrc/punica/bgmv/bgmv_fp16_fp16_fp16.cu b/csrc/punica/bgmv/bgmv_fp16_fp16_fp16.cu deleted file mode 100644 index d225a1eaa82b..000000000000 --- a/csrc/punica/bgmv/bgmv_fp16_fp16_fp16.cu +++ /dev/null @@ -1,5 +0,0 @@ -#include "bgmv_config.h" -#include "bgmv_impl.cuh" - -FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, nv_half, nv_half, nv_half) -FOR_INST_BGMV_WIDE_NARROW(INST_BGMV_ONESIDE, nv_half, nv_half, nv_half) diff --git a/csrc/punica/bgmv/bgmv_fp16_fp32_fp16.cu b/csrc/punica/bgmv/bgmv_fp16_fp32_fp16.cu deleted file mode 100644 index b37d288a7556..000000000000 --- a/csrc/punica/bgmv/bgmv_fp16_fp32_fp16.cu +++ /dev/null @@ -1,5 +0,0 @@ -#include "bgmv_config.h" -#include "bgmv_impl.cuh" - -FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, nv_half, float, nv_half) -FOR_INST_BGMV_WIDE_NARROW(INST_BGMV_ONESIDE, nv_half, float, nv_half) diff --git a/csrc/punica/bgmv/bgmv_fp32_bf16_bf16.cu b/csrc/punica/bgmv/bgmv_fp32_bf16_bf16.cu deleted file mode 100644 index a1ab2deecbab..000000000000 --- a/csrc/punica/bgmv/bgmv_fp32_bf16_bf16.cu +++ /dev/null @@ -1,5 +0,0 @@ -#include "bgmv_config.h" -#include "bgmv_impl.cuh" - -FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, float, nv_bfloat16, nv_bfloat16) -FOR_INST_BGMV_WIDE_NARROW(INST_BGMV_ONESIDE, float, nv_bfloat16, nv_bfloat16) diff --git 
a/csrc/punica/bgmv/bgmv_fp32_fp16_fp16.cu b/csrc/punica/bgmv/bgmv_fp32_fp16_fp16.cu deleted file mode 100644 index 0b35bf569989..000000000000 --- a/csrc/punica/bgmv/bgmv_fp32_fp16_fp16.cu +++ /dev/null @@ -1,5 +0,0 @@ -#include "bgmv_config.h" -#include "bgmv_impl.cuh" - -FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, float, nv_half, nv_half) -FOR_INST_BGMV_WIDE_NARROW(INST_BGMV_ONESIDE, float, nv_half, nv_half) diff --git a/csrc/punica/bgmv/bgmv_impl.cuh b/csrc/punica/bgmv/bgmv_impl.cuh deleted file mode 100644 index 8a3b8403b4a6..000000000000 --- a/csrc/punica/bgmv/bgmv_impl.cuh +++ /dev/null @@ -1,451 +0,0 @@ -#pragma once - -#include -#ifndef USE_ROCM -#include -#else -#include -#endif -#ifndef USE_ROCM -#include -#endif -#include -#include -#include - -#include "vec_dtypes.cuh" - -namespace cg = cooperative_groups; - -#ifdef USE_ROCM -template -__host__ __device__ -inline void* memcpy_blocking(void *dst, const void *src) { - // Does not handle the case of long datatypes - char *d = reinterpret_cast(dst); - const char *s = reinterpret_cast(src); - size_t i = 0; -#pragma unroll - for (i = 0; i < len; ++i) { - d[i] = s[i]; - } - return dst; -} -#endif - -#ifndef USE_ROCM - -// nthrs = (32, 4) -template -__global__ void -bgmv_shrink_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X, - const W_T *__restrict__ W, - const int64_t *__restrict__ indicies, int64_t y_offset, - int64_t full_y_size, int64_t num_layers, int64_t layer_idx, - float scale) { - size_t batch_idx = blockIdx.y; - int64_t idx = indicies[batch_idx] * num_layers + layer_idx; - if (idx < 0) { - return; - } - - auto block = cg::this_thread_block(); - size_t j = blockIdx.x; - constexpr size_t num_pipeline_stages = 2; - constexpr size_t tile_size = tx * ty * vec_size; - __shared__ W_T W_shared[num_pipeline_stages * tile_size]; - __shared__ in_T X_shared[num_pipeline_stages * tile_size]; - __shared__ float y_warpwise[ty]; - - size_t W_shared_offset[num_pipeline_stages] = {0U, 1U * tile_size}; - size_t X_shared_offset[num_pipeline_stages] = {0U, 1U * tile_size}; - auto pipe = cuda::make_pipeline(); - - // pipeline load W/X and compute WX; - pipe.producer_acquire(); - cuda::memcpy_async(W_shared + (threadIdx.y * tx + threadIdx.x) * vec_size, - W + (idx * feat_out + j) * feat_in + - (threadIdx.y * tx + threadIdx.x) * vec_size, - cuda::aligned_size_t(W_copy_size), pipe); - cuda::memcpy_async(X_shared + (threadIdx.y * tx + threadIdx.x) * vec_size, - X + (batch_idx * feat_in) + - (threadIdx.y * tx + threadIdx.x) * vec_size, - cuda::aligned_size_t(X_copy_size), pipe); - pipe.producer_commit(); - size_t copy_idx, compute_idx; - float y = 0.f; - vec_t x_vec; - vec_t w_vec; - size_t tile_idx; - -#pragma unroll - for (tile_idx = 1; tile_idx < (feat_in + tile_size - 1) / tile_size; - ++tile_idx) { - copy_idx = tile_idx % num_pipeline_stages; - // pipeline stage: async copy W fragment - pipe.producer_acquire(); - if (tile_idx * tile_size + threadIdx.y * tx * vec_size < feat_in) { - cuda::memcpy_async(W_shared + W_shared_offset[copy_idx] + - (threadIdx.y * tx + threadIdx.x) * vec_size, - W + (idx * feat_out + j) * feat_in + - tile_idx * tile_size + - (threadIdx.y * tx + threadIdx.x) * vec_size, - cuda::aligned_size_t(W_copy_size), pipe); - cuda::memcpy_async(X_shared + X_shared_offset[copy_idx] + - (threadIdx.y * tx + threadIdx.x) * vec_size, - X + (batch_idx * feat_in) + tile_idx * tile_size + - (threadIdx.y * tx + threadIdx.x) * vec_size, - cuda::aligned_size_t(X_copy_size), pipe); - } - pipe.producer_commit(); - - compute_idx = (tile_idx - 
1) % num_pipeline_stages; - // pipeline stage: compute WX - pipe.consumer_wait(); - block.sync(); - x_vec.load(X_shared + X_shared_offset[compute_idx] + - (threadIdx.y * tx + threadIdx.x) * vec_size); - w_vec.load(W_shared + W_shared_offset[compute_idx] + - (threadIdx.y * tx + threadIdx.x) * vec_size); - float sum = 0.f; -#pragma unroll - for (size_t i = 0; i < vec_size; ++i) { - sum += float(w_vec[i]) * float(x_vec[i]) * scale; - } -#pragma unroll - for (size_t offset = tx / 2; offset > 0; offset /= 2) { - sum += __shfl_down_sync(0xffffffff, sum, offset); - } - y_warpwise[threadIdx.y] = sum; - block.sync(); -#pragma unroll - for (size_t i = 0; i < ty; ++i) { - y += y_warpwise[i]; - } - - block.sync(); - pipe.consumer_release(); - } - - compute_idx = (tile_idx - 1) % num_pipeline_stages; - // final pipeline stage - pipe.consumer_wait(); - block.sync(); - x_vec.load(X_shared + X_shared_offset[compute_idx] + - (threadIdx.y * tx + threadIdx.x) * vec_size); - w_vec.load(W_shared + W_shared_offset[compute_idx] + - (threadIdx.y * tx + threadIdx.x) * vec_size); - float sum = 0.f; -#pragma unroll - for (size_t i = 0; i < vec_size; ++i) { - sum += float(w_vec[i]) * float(x_vec[i]) * scale; - } -#pragma unroll - for (size_t offset = tx / 2; offset > 0; offset /= 2) { - sum += __shfl_down_sync(0xffffffff, sum, offset); - } - y_warpwise[threadIdx.y] = - ((tile_idx - 1) * tile_size + threadIdx.y * tx * vec_size < feat_in) - ? sum - : 0.f; - block.sync(); -#pragma unroll - for (size_t i = 0; i < ty; ++i) { - y += y_warpwise[i]; - } - - block.sync(); - pipe.consumer_release(); - - // write Y; - if (block.thread_rank() == 0) { - Y[batch_idx * full_y_size + y_offset + j] += static_cast(y); - } -} - -#else - -template -__global__ void -bgmv_shrink_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X, - const W_T *__restrict__ W, - const int64_t *__restrict__ indicies, int64_t y_offset, - int64_t full_y_size, int64_t num_layers, int64_t layer_idx, - float scale) { - size_t batch_idx = blockIdx.y; - int64_t idx = indicies[batch_idx] * num_layers + layer_idx; - if (idx < 0) { - return; - } - - size_t j = blockIdx.x; - constexpr size_t tile_size = tx * ty * vec_size; - constexpr size_t num_tiles = (feat_in + tile_size - 1) / tile_size; - __shared__ float y_warpwise[ty]; - - float y = 0; - vec_t x_vec; - vec_t w_vec; - size_t tile_idx; - -#pragma unroll - for (tile_idx = 0; tile_idx < num_tiles; ++tile_idx) { - if (tile_idx * tile_size + (threadIdx.y * tx + threadIdx.x + 1) * vec_size - 1 < feat_in) { - x_vec.load(X + (batch_idx * feat_in) + - tile_idx * tile_size + - (threadIdx.y * tx + threadIdx.x) * vec_size); - w_vec.load(W + (idx * feat_out + j) * feat_in + - tile_idx * tile_size + - (threadIdx.y * tx + threadIdx.x) * vec_size); - } - - float sum = 0.f; -#pragma unroll - for (size_t i = 0; i < vec_size; ++i) { - sum += convert_type(w_vec[i]) * convert_type(x_vec[i]) * scale; - } -#pragma unroll - for (size_t offset = tx / 2; offset > 0; offset /= 2) { - sum += VLLM_SHFL_DOWN_SYNC(sum, offset); - } - - __syncthreads(); - - if (tile_idx * tile_size + (threadIdx.y * tx + threadIdx.x + 1) * vec_size - 1 < feat_in) { - y += sum; - } - } - - if (threadIdx.x == 0) { - y_warpwise[threadIdx.y] = y; - } - __syncthreads(); - - float y_write = 0.f; -#pragma unroll - for (size_t i = 0; i < ty; ++i) { - y_write += y_warpwise[i]; - } - - // write Y; - if (threadIdx.x == 0 && threadIdx.y == 0) { - size_t y_idx = batch_idx * full_y_size + y_offset + j; - Y[y_idx] = vllm_add(Y[y_idx], convert_type(y_write)); - } -} - 
-#endif - -// nthrs = (2, 16, 4) -template -__global__ void -bgmv_expand_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X, - const W_T *__restrict__ W, - const int64_t *__restrict__ indicies, int64_t y_offset, - int64_t full_y_size, int64_t num_layers, int64_t layer_idx, - float scale) { - size_t batch_idx = blockIdx.y; - int64_t idx = indicies[batch_idx] * num_layers + layer_idx; - - if (idx < 0) { - return; - } - - auto block = cg::this_thread_block(); - size_t tile_idx = blockIdx.x; - - // load X; - vec_t x_vec; - x_vec.load(X + batch_idx * feat_in + threadIdx.x * vec_size); - - // load W; - vec_t w_vec; - w_vec.load(W + (idx * feat_out + tile_idx * tz * ty) * feat_in + - block.thread_rank() * vec_size); - - float sum = 0.f; -#pragma unroll - for (size_t i = 0; i < vec_size; ++i) { -#ifndef USE_ROCM - sum += float(w_vec[i]) * float(x_vec[i]) * scale; -#else - sum += convert_type(w_vec[i]) * convert_type(x_vec[i]) * scale; -#endif - } - - cg::thread_block_tile g = cg::tiled_partition(block); -#pragma unroll - for (size_t offset = tx / 2; offset > 0; offset /= 2) { - sum += g.shfl_down(sum, offset); - } - sum = g.shfl(sum, 0); - - if (threadIdx.x == 0) { -#ifndef USE_ROCM - Y[batch_idx * full_y_size + y_offset + tile_idx * (tz * ty) + - threadIdx.z * ty + threadIdx.y] += static_cast(sum); -#else - size_t y_idx = batch_idx * full_y_size + y_offset + tile_idx * (tz * ty) + - threadIdx.z * ty + threadIdx.y; - Y[y_idx] = vllm_add(Y[y_idx], convert_type(sum)); -#endif - } -} - -template -void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X, - const W_T *__restrict__ W, - const int64_t *__restrict__ indicies, int64_t y_offset, - int64_t full_y_size, int64_t batch_size, int64_t num_layers, - int64_t layer_idx, float scale) { - constexpr size_t vec_size = 8; - constexpr int tz = 4; - const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); - - if constexpr (feat_in <= feat_out) { - static_assert(feat_in % vec_size == 0); - constexpr int tx = feat_in / vec_size; - - static_assert((32 % tx == 0 && feat_out % (32 / tx * tz) == 0) || - (16 % tx == 0 && feat_out % (16 / tx * tz) == 0) || - (8 % tx == 0 && feat_out % (8 / tx * tz) == 0)); - - if constexpr (32 % tx == 0 && feat_out % (32 / tx * tz) == 0) { - constexpr int ty = 32 / tx; - dim3 nblks(feat_out / (ty * tz), batch_size); - dim3 nthrs(tx, ty, tz); - - bgmv_expand_kernel - <<>>(Y, X, W, indicies, y_offset, - full_y_size, num_layers, layer_idx, - scale); - } else if (16 % tx == 0 && feat_out % (16 / tx * tz) == 0) { - constexpr int ty = 16 / tx; - dim3 nblks(feat_out / (ty * tz), batch_size); - dim3 nthrs(tx, ty, tz); - - bgmv_expand_kernel - <<>>(Y, X, W, indicies, y_offset, - full_y_size, num_layers, layer_idx, - scale); - } else { - constexpr int ty = 8 / tx; - dim3 nblks(feat_out / (ty * tz), batch_size); - dim3 nthrs(tx, ty, tz); - - bgmv_expand_kernel - <<>>(Y, X, W, indicies, y_offset, - full_y_size, num_layers, layer_idx, - scale); - } - } else { -#ifndef USE_ROCM - static_assert(feat_in % (vec_size * 32) == 0 || - feat_in % (vec_size * 16) == 0 || - feat_in % (vec_size * 8) == 0); - - if constexpr (feat_in % (vec_size * 32) == 0) { - constexpr int tx = 32; - constexpr int ty = 4; - - dim3 nblks(feat_out, batch_size); - dim3 nthrs(tx, ty); - - bgmv_shrink_kernel - <<>>(Y, X, W, indicies, y_offset, - full_y_size, num_layers, layer_idx, - scale); - } else if constexpr (feat_in % (vec_size / 2 * 32) == 0) { - constexpr int tx = 32; - constexpr int ty = 4; - - dim3 nblks(feat_out, batch_size); - dim3 nthrs(tx, 
ty); - - bgmv_shrink_kernel - <<>>(Y, X, W, indicies, y_offset, - full_y_size, num_layers, layer_idx, - scale); - } else if constexpr (feat_in % (vec_size / 2 * 16) == 0) { - constexpr int tx = 16; - constexpr int ty = 4; - - dim3 nblks(feat_out, batch_size); - dim3 nthrs(tx, ty); - - bgmv_shrink_kernel - <<>>(Y, X, W, indicies, y_offset, - full_y_size, num_layers, layer_idx, - scale); - } -#else - constexpr size_t rocm_warp_size = warpSize; - -#define CHECK_INPUT_TILEABLE_BY(vec_size_) \ - feat_in % (rocm_warp_size * vec_size_) == 0 - -#define LAUNCH_BGMV_SHRINK_KERNELS_ROCM(factor_, vec_size_, tx_, ty_) \ - if constexpr (CHECK_INPUT_TILEABLE_BY(factor_)) { \ - constexpr size_t vec_size_shrink = vec_size_; \ - constexpr int tx = tx_; \ - constexpr int ty = ty_; \ - dim3 nblks(feat_out, batch_size); \ - dim3 nthrs(tx, ty); \ - bgmv_shrink_kernel \ - <<>>(Y, X, W, indicies, y_offset, \ - full_y_size, num_layers, layer_idx, \ - scale); \ - } - - static_assert(CHECK_INPUT_TILEABLE_BY(32) || - CHECK_INPUT_TILEABLE_BY(16) || - CHECK_INPUT_TILEABLE_BY( 8) || - CHECK_INPUT_TILEABLE_BY( 4) || - CHECK_INPUT_TILEABLE_BY( 2) || - CHECK_INPUT_TILEABLE_BY( 1)); - - LAUNCH_BGMV_SHRINK_KERNELS_ROCM(32, vec_size, rocm_warp_size, 32/vec_size) - else - LAUNCH_BGMV_SHRINK_KERNELS_ROCM(16, vec_size, rocm_warp_size, 16/vec_size) - else - LAUNCH_BGMV_SHRINK_KERNELS_ROCM( 8, vec_size, rocm_warp_size, 8/vec_size) - else - LAUNCH_BGMV_SHRINK_KERNELS_ROCM( 4, vec_size, rocm_warp_size/(vec_size/4), vec_size/4) - else - LAUNCH_BGMV_SHRINK_KERNELS_ROCM( 2, vec_size, rocm_warp_size/(vec_size/2), vec_size/2) - else - LAUNCH_BGMV_SHRINK_KERNELS_ROCM( 1, vec_size, rocm_warp_size/(vec_size/1), vec_size/1) - -#undef CHECK_INPUT_TILEABLE_BY -#undef LAUNCH_BGMV_SHRINK_KERNELS_ROCM -#endif - } -} - -#define INST_BGMV(feat_in, feat_out, in_T, out_T, W_T) \ - template void bgmv_kernel( \ - out_T * __restrict__ Y, const in_T *__restrict__ X, \ - const W_T *__restrict__ W, const int64_t *__restrict__ indicies, \ - int64_t y_offset, int64_t full_y_size, int64_t batch_size, \ - int64_t num_layers, int64_t layer_idx, float scale); - -#define INST_BGMV_ONESIDE(in_T, out_T, W_T, feat_in, feat_out) \ - INST_BGMV(feat_in, feat_out, in_T, out_T, W_T) - -#define INST_BGMV_TWOSIDE(in_T, out_T, W_T, narrow, wide) \ - INST_BGMV(narrow, wide, in_T, out_T, W_T) \ - INST_BGMV(wide, narrow, in_T, out_T, W_T) diff --git a/csrc/punica/bgmv/generator.py b/csrc/punica/bgmv/generator.py deleted file mode 100644 index 972df5a7208c..000000000000 --- a/csrc/punica/bgmv/generator.py +++ /dev/null @@ -1,48 +0,0 @@ -DTYPES = ["fp16", "bf16", "fp32"] -DTYPE_MAP = { - "fp16": "nv_half", - "bf16": "nv_bfloat16", - "fp32": "float", -} - -TEMPLATE = """ -#include "bgmv_config.h" -#include "bgmv_impl.cuh" - -FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, {input_dtype}, {output_dtype}, {weight_dtype}) -FOR_INST_BGMV_WIDE_NARROW(INST_BGMV_ONESIDE, {input_dtype}, {output_dtype}, {weight_dtype}) -""".lstrip() # noqa: E501 - -for input_dtype in DTYPES: - for output_dtype in DTYPES: - for weight_dtype in DTYPES: - if weight_dtype == "fp32": - # FP32 weights are not supported. - continue - if output_dtype == "fp32": - # LoRA A matrix. - if input_dtype != weight_dtype: - # NOTE(woosuk): While Punica supports the case where the - # input and weight dtypes are different, we only generate - # the kernels the same dtypes to reduce the binary size. - continue - elif input_dtype == "fp32": - # LoRA B matrix. 
- if output_dtype != weight_dtype: - # NOTE(woosuk): While Punica supports the case where the - # output and weight dtypes are different, we only generate - # the kernels the same dtypes to reduce the binary size. - continue - elif not (input_dtype == output_dtype == weight_dtype): - # NOTE(woosuk): While Punica supports mixed data types for - # input, output, and weight, we only generate the kernels with - # the same data types to reduce the binary size. - continue - - kernel_definition = TEMPLATE.format( - input_dtype=DTYPE_MAP[input_dtype], - output_dtype=DTYPE_MAP[output_dtype], - weight_dtype=DTYPE_MAP[weight_dtype]) - filename = f"bgmv_{input_dtype}_{output_dtype}_{weight_dtype}.cu" - with open(filename, "w") as f: - f.write(kernel_definition) diff --git a/csrc/punica/bgmv/vec_dtypes.cuh b/csrc/punica/bgmv/vec_dtypes.cuh deleted file mode 100644 index 2738892e6dc4..000000000000 --- a/csrc/punica/bgmv/vec_dtypes.cuh +++ /dev/null @@ -1,1325 +0,0 @@ -#ifndef VEC_DTYPES_CUH_ -#define VEC_DTYPES_CUH_ - -#ifdef FLASHINFER_USE_FP8 -#include -#endif -#include - -#include - -#include "../type_convert.h" -#include "../../cuda_compat.h" - -#define FLASHINFER_INLINE \ - inline __attribute__((always_inline)) __device__ __host__ - -template -struct vec_t { - FLASHINFER_INLINE float_t &operator[](size_t i); - FLASHINFER_INLINE const float_t &operator[](size_t i) const; - FLASHINFER_INLINE void fill(float_t val); - FLASHINFER_INLINE void load(const float_t *ptr); - FLASHINFER_INLINE void store(float_t *ptr) const; - template - FLASHINFER_INLINE void cast_from(const vec_t &src); - template - FLASHINFER_INLINE void cast_load(const T *ptr); - template - FLASHINFER_INLINE void cast_store(T *ptr) const; - FLASHINFER_INLINE static void memcpy(float_t *dst, const float_t *src); -}; - -template -FLASHINFER_INLINE void cast_from_impl(const vec_t &src, - vec_t &dst) { -#pragma unroll - for (size_t i = 0; i < vec_size; ++i) { - dst[i] = tgt_float_t(src[i]); - } -} - -template -FLASHINFER_INLINE void cast_load_impl(const src_float_t *src_ptr, - vec_t &dst) { - if constexpr (std::is_same::value) { - dst.load(src_ptr); - } else { - vec_t tmp; - tmp.load(src_ptr); - dst.cast_from(tmp); - } -} - -template -FLASHINFER_INLINE void cast_store_impl(const vec_t &src, - tgt_float_t *dst_ptr) { - if constexpr (std::is_same::value) { - src.store(dst_ptr); - } else { - vec_t tmp; - tmp.cast_from(src); - tmp.store(dst_ptr); - } -} - -#ifdef FLASHINFER_USE_FP8 -/******************* vec_t<__nv_fp8_e4m3> *******************/ - -// __nv_fp8_e4m3 x 1 -template <> -struct vec_t<__nv_fp8_e4m3, 1> { - __nv_fp8_e4m3 data; - - FLASHINFER_INLINE __nv_fp8_e4m3 &operator[](size_t i) { - return ((__nv_fp8_e4m3 *)(&data))[i]; - } - FLASHINFER_INLINE const __nv_fp8_e4m3 &operator[](size_t i) const { - return ((const __nv_fp8_e4m3 *)(&data))[i]; - } - FLASHINFER_INLINE void fill(__nv_fp8_e4m3 val); - FLASHINFER_INLINE void load(const __nv_fp8_e4m3 *ptr); - FLASHINFER_INLINE void store(__nv_fp8_e4m3 *ptr) const; - template - FLASHINFER_INLINE void cast_from(const vec_t &src) { - cast_from_impl(src, *this); - } - template - FLASHINFER_INLINE void cast_load(const T *ptr) { - cast_load_impl(ptr, *this); - } - template - FLASHINFER_INLINE void cast_store(T *ptr) const { - cast_store_impl(*this, ptr); - } - - FLASHINFER_INLINE static void memcpy(__nv_fp8_e4m3 *dst, - const __nv_fp8_e4m3 *src); -}; - -FLASHINFER_INLINE void vec_t<__nv_fp8_e4m3, 1>::fill(__nv_fp8_e4m3 val) { - data = val; -} - -FLASHINFER_INLINE void vec_t<__nv_fp8_e4m3, 
1>::load(const __nv_fp8_e4m3 *ptr) { - data = *ptr; -} - -FLASHINFER_INLINE void vec_t<__nv_fp8_e4m3, 1>::store( - __nv_fp8_e4m3 *ptr) const { - *ptr = data; -} - -FLASHINFER_INLINE void vec_t<__nv_fp8_e4m3, 1>::memcpy( - __nv_fp8_e4m3 *dst, const __nv_fp8_e4m3 *src) { - *dst = *src; -} - -// __nv_fp8_e4m3 x 2 -template <> -struct vec_t<__nv_fp8_e4m3, 2> { - __nv_fp8x2_e4m3 data; - - FLASHINFER_INLINE __nv_fp8_e4m3 &operator[](size_t i) { - return ((__nv_fp8_e4m3 *)(&data))[i]; - } - FLASHINFER_INLINE const __nv_fp8_e4m3 &operator[](size_t i) const { - return ((const __nv_fp8_e4m3 *)(&data))[i]; - } - FLASHINFER_INLINE void fill(__nv_fp8_e4m3 val); - FLASHINFER_INLINE void load(const __nv_fp8_e4m3 *ptr); - FLASHINFER_INLINE void store(__nv_fp8_e4m3 *ptr) const; - template - FLASHINFER_INLINE void cast_from(const vec_t &src) { - cast_from_impl(src, *this); - } - template - FLASHINFER_INLINE void cast_load(const T *ptr) { - cast_load_impl(ptr, *this); - } - template - FLASHINFER_INLINE void cast_store(T *ptr) const { - cast_store_impl(*this, ptr); - } - - FLASHINFER_INLINE static void memcpy(__nv_fp8_e4m3 *dst, - const __nv_fp8_e4m3 *src); -}; - -FLASHINFER_INLINE void vec_t<__nv_fp8_e4m3, 2>::fill(__nv_fp8_e4m3 val) { - data.__x = - (__nv_fp8x2_storage_t(val.__x) << 8) | __nv_fp8x2_storage_t(val.__x); -} - -FLASHINFER_INLINE void vec_t<__nv_fp8_e4m3, 2>::load(const __nv_fp8_e4m3 *ptr) { - data = *((__nv_fp8x2_e4m3 *)ptr); -} - -FLASHINFER_INLINE void vec_t<__nv_fp8_e4m3, 2>::store( - __nv_fp8_e4m3 *ptr) const { - *((__nv_fp8x2_e4m3 *)ptr) = data; -} - -FLASHINFER_INLINE void vec_t<__nv_fp8_e4m3, 2>::memcpy( - __nv_fp8_e4m3 *dst, const __nv_fp8_e4m3 *src) { - *((__nv_fp8x2_e4m3 *)dst) = *((__nv_fp8x2_e4m3 *)src); -} - -// __nv_fp8_e4m3 x 4 - -template <> -struct vec_t<__nv_fp8_e4m3, 4> { - __nv_fp8x4_e4m3 data; - - FLASHINFER_INLINE __nv_fp8_e4m3 &operator[](size_t i) { - return ((__nv_fp8_e4m3 *)(&data))[i]; - } - FLASHINFER_INLINE const __nv_fp8_e4m3 &operator[](size_t i) const { - return ((const __nv_fp8_e4m3 *)(&data))[i]; - } - FLASHINFER_INLINE void fill(__nv_fp8_e4m3 val); - FLASHINFER_INLINE void load(const __nv_fp8_e4m3 *ptr); - FLASHINFER_INLINE void store(__nv_fp8_e4m3 *ptr) const; - template - FLASHINFER_INLINE void cast_from(const vec_t &src) { - cast_from_impl(src, *this); - } - template - FLASHINFER_INLINE void cast_load(const T *ptr) { - cast_load_impl(ptr, *this); - } - template - FLASHINFER_INLINE void cast_store(T *ptr) const { - cast_store_impl(*this, ptr); - } - - FLASHINFER_INLINE static void memcpy(__nv_fp8_e4m3 *dst, - const __nv_fp8_e4m3 *src); -}; - -FLASHINFER_INLINE void vec_t<__nv_fp8_e4m3, 4>::fill(__nv_fp8_e4m3 val) { - data.__x = (__nv_fp8x4_storage_t(val.__x) << 24) | - (__nv_fp8x4_storage_t(val.__x) << 16) | - (__nv_fp8x4_storage_t(val.__x) << 8) | - __nv_fp8x4_storage_t(val.__x); -} - -FLASHINFER_INLINE void vec_t<__nv_fp8_e4m3, 4>::load(const __nv_fp8_e4m3 *ptr) { - data = *((__nv_fp8x4_e4m3 *)ptr); -} - -FLASHINFER_INLINE void vec_t<__nv_fp8_e4m3, 4>::store( - __nv_fp8_e4m3 *ptr) const { - *((__nv_fp8x4_e4m3 *)ptr) = data; -} - -FLASHINFER_INLINE void vec_t<__nv_fp8_e4m3, 4>::memcpy( - __nv_fp8_e4m3 *dst, const __nv_fp8_e4m3 *src) { - *((__nv_fp8x4_e4m3 *)dst) = *((__nv_fp8x4_e4m3 *)src); -} - -// __nv_fp8_e4m3 x 8 - -template <> -struct vec_t<__nv_fp8_e4m3, 8> { - uint2 data; - - FLASHINFER_INLINE __nv_fp8_e4m3 &operator[](size_t i) { - return ((__nv_fp8_e4m3 *)(&data))[i]; - } - FLASHINFER_INLINE const __nv_fp8_e4m3 &operator[](size_t i) const { - 
return ((const __nv_fp8_e4m3 *)(&data))[i]; - } - FLASHINFER_INLINE void fill(__nv_fp8_e4m3 val); - FLASHINFER_INLINE void load(const __nv_fp8_e4m3 *ptr); - FLASHINFER_INLINE void store(__nv_fp8_e4m3 *ptr) const; - template - FLASHINFER_INLINE void cast_from(const vec_t &src) { - cast_from_impl(src, *this); - } - template - FLASHINFER_INLINE void cast_load(const T *ptr) { - cast_load_impl(ptr, *this); - } - template - FLASHINFER_INLINE void cast_store(T *ptr) const { - cast_store_impl(*this, ptr); - } - - FLASHINFER_INLINE static void memcpy(__nv_fp8_e4m3 *dst, - const __nv_fp8_e4m3 *src); -}; - -FLASHINFER_INLINE void vec_t<__nv_fp8_e4m3, 8>::fill(__nv_fp8_e4m3 val) { - ((__nv_fp8x4_e4m3 *)(&data.x))->__x = (__nv_fp8x4_storage_t(val.__x) << 24) | - (__nv_fp8x4_storage_t(val.__x) << 16) | - (__nv_fp8x4_storage_t(val.__x) << 8) | - __nv_fp8x4_storage_t(val.__x); - ((__nv_fp8x4_e4m3 *)(&data.y))->__x = (__nv_fp8x4_storage_t(val.__x) << 24) | - (__nv_fp8x4_storage_t(val.__x) << 16) | - (__nv_fp8x4_storage_t(val.__x) << 8) | - __nv_fp8x4_storage_t(val.__x); -} - -FLASHINFER_INLINE void vec_t<__nv_fp8_e4m3, 8>::load(const __nv_fp8_e4m3 *ptr) { - data = *((uint2 *)ptr); -} - -FLASHINFER_INLINE void vec_t<__nv_fp8_e4m3, 8>::store( - __nv_fp8_e4m3 *ptr) const { - *((uint2 *)ptr) = data; -} - -FLASHINFER_INLINE void vec_t<__nv_fp8_e4m3, 8>::memcpy( - __nv_fp8_e4m3 *dst, const __nv_fp8_e4m3 *src) { - *((__nv_fp8_e4m3 *)dst) = *((__nv_fp8_e4m3 *)src); -} - -// __nv_fp8_e4m3 x 16 or more -template -struct vec_t<__nv_fp8_e4m3, vec_size> { - uint4 data[vec_size / 16]; - - FLASHINFER_INLINE __nv_fp8_e4m3 &operator[](size_t i) { - return ((__nv_fp8_e4m3 *)data)[i]; - } - FLASHINFER_INLINE const __nv_fp8_e4m3 &operator[](size_t i) const { - return ((const __nv_fp8_e4m3 *)data)[i]; - } - FLASHINFER_INLINE void fill(__nv_fp8_e4m3 val) { -#pragma unroll - for (size_t i = 0; i < vec_size / 16; ++i) { - ((__nv_fp8x4_e4m3 *)(&(data[i].x)))->__x = - (__nv_fp8x4_storage_t(val.__x) << 24) | - (__nv_fp8x4_storage_t(val.__x) << 16) | - (__nv_fp8x4_storage_t(val.__x) << 8) | __nv_fp8x4_storage_t(val.__x); - ((__nv_fp8x4_e4m3 *)(&(data[i].y)))->__x = - (__nv_fp8x4_storage_t(val.__x) << 24) | - (__nv_fp8x4_storage_t(val.__x) << 16) | - (__nv_fp8x4_storage_t(val.__x) << 8) | __nv_fp8x4_storage_t(val.__x); - ((__nv_fp8x4_e4m3 *)(&(data[i].z)))->__x = - (__nv_fp8x4_storage_t(val.__x) << 24) | - (__nv_fp8x4_storage_t(val.__x) << 16) | - (__nv_fp8x4_storage_t(val.__x) << 8) | __nv_fp8x4_storage_t(val.__x); - ((__nv_fp8x4_e4m3 *)(&(data[i].w)))->__x = - (__nv_fp8x4_storage_t(val.__x) << 24) | - (__nv_fp8x4_storage_t(val.__x) << 16) | - (__nv_fp8x4_storage_t(val.__x) << 8) | __nv_fp8x4_storage_t(val.__x); - } - } - FLASHINFER_INLINE void load(const __nv_fp8_e4m3 *ptr) { -#pragma unroll - for (size_t i = 0; i < vec_size / 16; ++i) { - data[i] = ((uint4 *)ptr)[i]; - } - } - FLASHINFER_INLINE void store(__nv_fp8_e4m3 *ptr) const { -#pragma unroll - for (size_t i = 0; i < vec_size / 16; ++i) { - ((uint4 *)ptr)[i] = data[i]; - } - } - template - FLASHINFER_INLINE void cast_from(const vec_t &src) { - cast_from_impl(src, *this); - } - template - FLASHINFER_INLINE void cast_load(const T *ptr) { - cast_load_impl(ptr, *this); - } - template - FLASHINFER_INLINE void cast_store(T *ptr) const { - cast_store_impl(*this, ptr); - } - - FLASHINFER_INLINE static void memcpy(__nv_fp8_e4m3 *dst, - const __nv_fp8_e4m3 *src) { -#pragma unroll - for (size_t i = 0; i < vec_size / 16; ++i) { - ((uint4 *)dst)[i] = ((uint4 *)src)[i]; - } - } -}; - 
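The long shift-or expressions in the fill() methods above (and in the e5m2 section that follows) all do the same thing: replicate one 8-bit fp8 payload across every byte of a packed 16- or 32-bit storage word. A small Python sketch of that packing (the function name is ours):

    def replicate_fp8_byte(byte_val: int, lanes: int) -> int:
        # Same pattern as (x << 24) | (x << 16) | (x << 8) | x in the
        # vec_t<__nv_fp8_*, N>::fill specializations above.
        assert 0 <= byte_val <= 0xFF
        packed = 0
        for lane in range(lanes):
            packed |= byte_val << (8 * lane)
        return packed

    assert replicate_fp8_byte(0xAB, 2) == 0xABAB          # __nv_fp8x2 storage word
    assert replicate_fp8_byte(0xAB, 4) == 0xABABABAB      # __nv_fp8x4 storage word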
-/******************* vec_t<__nv_fp8_e5m2> *******************/ - -// __nv_fp8_e5m2 x 1 -template <> -struct vec_t<__nv_fp8_e5m2, 1> { - __nv_fp8_e5m2 data; - - FLASHINFER_INLINE __nv_fp8_e5m2 &operator[](size_t i) { - return ((__nv_fp8_e5m2 *)(&data))[i]; - } - FLASHINFER_INLINE const __nv_fp8_e5m2 &operator[](size_t i) const { - return ((const __nv_fp8_e5m2 *)(&data))[i]; - } - FLASHINFER_INLINE void fill(__nv_fp8_e5m2 val); - FLASHINFER_INLINE void load(const __nv_fp8_e5m2 *ptr); - FLASHINFER_INLINE void store(__nv_fp8_e5m2 *ptr) const; - template - FLASHINFER_INLINE void cast_from(const vec_t &src) { - cast_from_impl(src, *this); - } - template - FLASHINFER_INLINE void cast_load(const T *ptr) { - cast_load_impl(ptr, *this); - } - template - FLASHINFER_INLINE void cast_store(T *ptr) const { - cast_store_impl(*this, ptr); - } - - FLASHINFER_INLINE static void memcpy(__nv_fp8_e5m2 *dst, - const __nv_fp8_e5m2 *src); -}; - -FLASHINFER_INLINE void vec_t<__nv_fp8_e5m2, 1>::fill(__nv_fp8_e5m2 val) { - data = val; -} - -FLASHINFER_INLINE void vec_t<__nv_fp8_e5m2, 1>::load(const __nv_fp8_e5m2 *ptr) { - data = *ptr; -} - -FLASHINFER_INLINE void vec_t<__nv_fp8_e5m2, 1>::store( - __nv_fp8_e5m2 *ptr) const { - *ptr = data; -} - -FLASHINFER_INLINE void vec_t<__nv_fp8_e5m2, 1>::memcpy( - __nv_fp8_e5m2 *dst, const __nv_fp8_e5m2 *src) { - *dst = *src; -} - -// __nv_fp8_e5m2 x 2 -template <> -struct vec_t<__nv_fp8_e5m2, 2> { - __nv_fp8x2_e5m2 data; - - FLASHINFER_INLINE __nv_fp8_e5m2 &operator[](size_t i) { - return ((__nv_fp8_e5m2 *)(&data))[i]; - } - FLASHINFER_INLINE const __nv_fp8_e5m2 &operator[](size_t i) const { - return ((const __nv_fp8_e5m2 *)(&data))[i]; - } - FLASHINFER_INLINE void fill(__nv_fp8_e5m2 val); - FLASHINFER_INLINE void load(const __nv_fp8_e5m2 *ptr); - FLASHINFER_INLINE void store(__nv_fp8_e5m2 *ptr) const; - template - FLASHINFER_INLINE void cast_from(const vec_t &src) { - cast_from_impl(src, *this); - } - template - FLASHINFER_INLINE void cast_load(const T *ptr) { - cast_load_impl(ptr, *this); - } - template - FLASHINFER_INLINE void cast_store(T *ptr) const { - cast_store_impl(*this, ptr); - } - - FLASHINFER_INLINE static void memcpy(__nv_fp8_e5m2 *dst, - const __nv_fp8_e5m2 *src); -}; - -FLASHINFER_INLINE void vec_t<__nv_fp8_e5m2, 2>::fill(__nv_fp8_e5m2 val) { - data.__x = - (__nv_fp8x2_storage_t(val.__x) << 8) | __nv_fp8x2_storage_t(val.__x); -} - -FLASHINFER_INLINE void vec_t<__nv_fp8_e5m2, 2>::load(const __nv_fp8_e5m2 *ptr) { - data = *((__nv_fp8x2_e5m2 *)ptr); -} - -FLASHINFER_INLINE void vec_t<__nv_fp8_e5m2, 2>::store( - __nv_fp8_e5m2 *ptr) const { - *((__nv_fp8x2_e5m2 *)ptr) = data; -} - -FLASHINFER_INLINE void vec_t<__nv_fp8_e5m2, 2>::memcpy( - __nv_fp8_e5m2 *dst, const __nv_fp8_e5m2 *src) { - *((__nv_fp8x2_e5m2 *)dst) = *((__nv_fp8x2_e5m2 *)src); -} - -// __nv_fp8_e5m2 x 4 - -template <> -struct vec_t<__nv_fp8_e5m2, 4> { - __nv_fp8x4_e5m2 data; - - FLASHINFER_INLINE __nv_fp8_e5m2 &operator[](size_t i) { - return ((__nv_fp8_e5m2 *)(&data))[i]; - } - FLASHINFER_INLINE const __nv_fp8_e5m2 &operator[](size_t i) const { - return ((const __nv_fp8_e5m2 *)(&data))[i]; - } - FLASHINFER_INLINE void fill(__nv_fp8_e5m2 val); - FLASHINFER_INLINE void load(const __nv_fp8_e5m2 *ptr); - FLASHINFER_INLINE void store(__nv_fp8_e5m2 *ptr) const; - template - FLASHINFER_INLINE void cast_from(const vec_t &src) { - cast_from_impl(src, *this); - } - template - FLASHINFER_INLINE void cast_load(const T *ptr) { - cast_load_impl(ptr, *this); - } - template - FLASHINFER_INLINE void cast_store(T 
*ptr) const { - cast_store_impl(*this, ptr); - } - - FLASHINFER_INLINE static void memcpy(__nv_fp8_e5m2 *dst, - const __nv_fp8_e5m2 *src); -}; - -FLASHINFER_INLINE void vec_t<__nv_fp8_e5m2, 4>::fill(__nv_fp8_e5m2 val) { - data.__x = (__nv_fp8x4_storage_t(val.__x) << 24) | - (__nv_fp8x4_storage_t(val.__x) << 16) | - (__nv_fp8x4_storage_t(val.__x) << 8) | - __nv_fp8x4_storage_t(val.__x); -} - -FLASHINFER_INLINE void vec_t<__nv_fp8_e5m2, 4>::load(const __nv_fp8_e5m2 *ptr) { - data = *((__nv_fp8x4_e5m2 *)ptr); -} - -FLASHINFER_INLINE void vec_t<__nv_fp8_e5m2, 4>::store( - __nv_fp8_e5m2 *ptr) const { - *((__nv_fp8x4_e5m2 *)ptr) = data; -} - -FLASHINFER_INLINE void vec_t<__nv_fp8_e5m2, 4>::memcpy( - __nv_fp8_e5m2 *dst, const __nv_fp8_e5m2 *src) { - *((__nv_fp8x4_e5m2 *)dst) = *((__nv_fp8x4_e5m2 *)src); -} - -// __nv_fp8_e5m2 x 8 - -template <> -struct vec_t<__nv_fp8_e5m2, 8> { - uint2 data; - - FLASHINFER_INLINE __nv_fp8_e5m2 &operator[](size_t i) { - return ((__nv_fp8_e5m2 *)(&data))[i]; - } - FLASHINFER_INLINE const __nv_fp8_e5m2 &operator[](size_t i) const { - return ((const __nv_fp8_e5m2 *)(&data))[i]; - } - FLASHINFER_INLINE void fill(__nv_fp8_e5m2 val); - FLASHINFER_INLINE void load(const __nv_fp8_e5m2 *ptr); - FLASHINFER_INLINE void store(__nv_fp8_e5m2 *ptr) const; - template - FLASHINFER_INLINE void cast_from(const vec_t &src) { - cast_from_impl(src, *this); - } - template - FLASHINFER_INLINE void cast_load(const T *ptr) { - cast_load_impl(ptr, *this); - } - template - FLASHINFER_INLINE void cast_store(T *ptr) const { - cast_store_impl(*this, ptr); - } - - FLASHINFER_INLINE static void memcpy(__nv_fp8_e5m2 *dst, - const __nv_fp8_e5m2 *src); -}; - -FLASHINFER_INLINE void vec_t<__nv_fp8_e5m2, 8>::fill(__nv_fp8_e5m2 val) { - ((__nv_fp8x4_e5m2 *)(&data.x))->__x = (__nv_fp8x4_storage_t(val.__x) << 24) | - (__nv_fp8x4_storage_t(val.__x) << 16) | - (__nv_fp8x4_storage_t(val.__x) << 8) | - __nv_fp8x4_storage_t(val.__x); - ((__nv_fp8x4_e5m2 *)(&data.y))->__x = (__nv_fp8x4_storage_t(val.__x) << 24) | - (__nv_fp8x4_storage_t(val.__x) << 16) | - (__nv_fp8x4_storage_t(val.__x) << 8) | - __nv_fp8x4_storage_t(val.__x); -} - -FLASHINFER_INLINE void vec_t<__nv_fp8_e5m2, 8>::load(const __nv_fp8_e5m2 *ptr) { - data = *((uint2 *)ptr); -} - -FLASHINFER_INLINE void vec_t<__nv_fp8_e5m2, 8>::store( - __nv_fp8_e5m2 *ptr) const { - *((uint2 *)ptr) = data; -} - -FLASHINFER_INLINE void vec_t<__nv_fp8_e5m2, 8>::memcpy( - __nv_fp8_e5m2 *dst, const __nv_fp8_e5m2 *src) { - *((__nv_fp8_e5m2 *)dst) = *((__nv_fp8_e5m2 *)src); -} - -// __nv_fp8_e5m2 x 16 or more - -template -struct vec_t<__nv_fp8_e5m2, vec_size> { - uint4 data[vec_size / 16]; - - FLASHINFER_INLINE __nv_fp8_e5m2 &operator[](size_t i) { - return ((__nv_fp8_e5m2 *)data)[i]; - } - FLASHINFER_INLINE const __nv_fp8_e5m2 &operator[](size_t i) const { - return ((const __nv_fp8_e5m2 *)data)[i]; - } - FLASHINFER_INLINE void fill(__nv_fp8_e5m2 val) { -#pragma unroll - for (size_t i = 0; i < vec_size / 16; ++i) { - ((__nv_fp8x4_e5m2 *)(&(data[i].x)))->__x = - (__nv_fp8x4_storage_t(val.__x) << 24) | - (__nv_fp8x4_storage_t(val.__x) << 16) | - (__nv_fp8x4_storage_t(val.__x) << 8) | __nv_fp8x4_storage_t(val.__x); - ((__nv_fp8x4_e5m2 *)(&(data[i].y)))->__x = - (__nv_fp8x4_storage_t(val.__x) << 24) | - (__nv_fp8x4_storage_t(val.__x) << 16) | - (__nv_fp8x4_storage_t(val.__x) << 8) | __nv_fp8x4_storage_t(val.__x); - ((__nv_fp8x4_e5m2 *)(&(data[i].z)))->__x = - (__nv_fp8x4_storage_t(val.__x) << 24) | - (__nv_fp8x4_storage_t(val.__x) << 16) | - (__nv_fp8x4_storage_t(val.__x) 
<< 8) | __nv_fp8x4_storage_t(val.__x); - ((__nv_fp8x4_e5m2 *)(&(data[i].w)))->__x = - (__nv_fp8x4_storage_t(val.__x) << 24) | - (__nv_fp8x4_storage_t(val.__x) << 16) | - (__nv_fp8x4_storage_t(val.__x) << 8) | __nv_fp8x4_storage_t(val.__x); - } - } - FLASHINFER_INLINE void load(const __nv_fp8_e5m2 *ptr) { -#pragma unroll - for (size_t i = 0; i < vec_size / 16; ++i) { - data[i] = ((uint4 *)ptr)[i]; - } - } - FLASHINFER_INLINE void store(__nv_fp8_e5m2 *ptr) const { -#pragma unroll - for (size_t i = 0; i < vec_size / 16; ++i) { - ((uint4 *)ptr)[i] = data[i]; - } - } - template - FLASHINFER_INLINE void cast_from(const vec_t &src) { - cast_from_impl(src, *this); - } - template - FLASHINFER_INLINE void cast_load(const T *ptr) { - cast_load_impl(ptr, *this); - } - template - FLASHINFER_INLINE void cast_store(T *ptr) const { - cast_store_impl(*this, ptr); - } - - FLASHINFER_INLINE static void memcpy(__nv_fp8_e5m2 *dst, - const __nv_fp8_e5m2 *src) { -#pragma unroll - for (size_t i = 0; i < vec_size / 16; ++i) { - ((uint4 *)dst)[i] = ((uint4 *)src)[i]; - } - } -}; -#endif - -/******************* vec_t *******************/ - -// half x 1 -template <> -struct vec_t { - half data; - - FLASHINFER_INLINE half &operator[](size_t i) { return ((half *)(&data))[i]; } - FLASHINFER_INLINE const half &operator[](size_t i) const { - return ((const half *)(&data))[i]; - } - FLASHINFER_INLINE void fill(half val); - FLASHINFER_INLINE void load(const half *ptr); - FLASHINFER_INLINE void store(half *ptr) const; - template - FLASHINFER_INLINE void cast_from(const vec_t &src) { - cast_from_impl(src, *this); - } - template - FLASHINFER_INLINE void cast_load(const T *ptr) { - cast_load_impl(ptr, *this); - } - template - FLASHINFER_INLINE void cast_store(T *ptr) const { - cast_store_impl(*this, ptr); - } - - FLASHINFER_INLINE static void memcpy(half *dst, const half *src); -}; - -FLASHINFER_INLINE void vec_t::fill(half val) { data = val; } - -FLASHINFER_INLINE void vec_t::load(const half *ptr) { data = *ptr; } - -FLASHINFER_INLINE void vec_t::store(half *ptr) const { *ptr = data; } - -FLASHINFER_INLINE void vec_t::memcpy(half *dst, const half *src) { - *dst = *src; -} - -// half x 2 -template <> -struct vec_t { - half2 data; - - FLASHINFER_INLINE half &operator[](size_t i) { return ((half *)(&data))[i]; } - FLASHINFER_INLINE const half &operator[](size_t i) const { - return ((const half *)(&data))[i]; - } - FLASHINFER_INLINE void fill(half val); - FLASHINFER_INLINE void load(const half *ptr); - FLASHINFER_INLINE void store(half *ptr) const; - template - FLASHINFER_INLINE void cast_from(const vec_t &src) { - cast_from_impl(src, *this); - } - template - FLASHINFER_INLINE void cast_load(const T *ptr) { - cast_load_impl(ptr, *this); - } - template - FLASHINFER_INLINE void cast_store(T *ptr) const { - cast_store_impl(*this, ptr); - } - - FLASHINFER_INLINE static void memcpy(half *dst, const half *src); -}; - -FLASHINFER_INLINE void vec_t::fill(half val) { - data = make_half2(val, val); -} - -FLASHINFER_INLINE void vec_t::load(const half *ptr) { - data = *((half2 *)ptr); -} - -FLASHINFER_INLINE void vec_t::store(half *ptr) const { - *((half2 *)ptr) = data; -} - -FLASHINFER_INLINE void vec_t::memcpy(half *dst, const half *src) { - *((half2 *)dst) = *((half2 *)src); -} - -// half x 4 - -template <> -struct vec_t { - uint2 data; - - FLASHINFER_INLINE half &operator[](size_t i) { return ((half *)(&data))[i]; } - FLASHINFER_INLINE const half &operator[](size_t i) const { - return ((const half *)(&data))[i]; - } - FLASHINFER_INLINE 
void fill(half val); - FLASHINFER_INLINE void load(const half *ptr); - FLASHINFER_INLINE void store(half *ptr) const; - template - FLASHINFER_INLINE void cast_from(const vec_t &src) { - cast_from_impl(src, *this); - } - template - FLASHINFER_INLINE void cast_load(const T *ptr) { - cast_load_impl(ptr, *this); - } - template - FLASHINFER_INLINE void cast_store(T *ptr) const { - cast_store_impl(*this, ptr); - } - - FLASHINFER_INLINE static void memcpy(half *dst, const half *src); -}; - -FLASHINFER_INLINE void vec_t::fill(half val) { - *(half2 *)(&data.x) = make_half2(val, val); - *(half2 *)(&data.y) = make_half2(val, val); -} - -FLASHINFER_INLINE void vec_t::load(const half *ptr) { - data = *((uint2 *)ptr); -} - -FLASHINFER_INLINE void vec_t::store(half *ptr) const { - *((uint2 *)ptr) = data; -} - -FLASHINFER_INLINE void vec_t::memcpy(half *dst, const half *src) { - *((uint2 *)dst) = *((uint2 *)src); -} - -// half x 8 or more - -template -struct vec_t { - uint4 data[vec_size / 8]; - FLASHINFER_INLINE half &operator[](size_t i) { return ((half *)data)[i]; } - FLASHINFER_INLINE const half &operator[](size_t i) const { - return ((const half *)data)[i]; - } - FLASHINFER_INLINE void fill(half val) { -#pragma unroll - for (size_t i = 0; i < vec_size; ++i) { - *(half2 *)(&(data[i].x)) = make_half2(val, val); - *(half2 *)(&(data[i].y)) = make_half2(val, val); - *(half2 *)(&(data[i].z)) = make_half2(val, val); - *(half2 *)(&(data[i].w)) = make_half2(val, val); - } - } - FLASHINFER_INLINE void load(const half *ptr) { -#pragma unroll - for (size_t i = 0; i < vec_size / 8; ++i) { - data[i] = ((uint4 *)ptr)[i]; - } - } - FLASHINFER_INLINE void store(half *ptr) const { -#pragma unroll - for (size_t i = 0; i < vec_size / 8; ++i) { - ((uint4 *)ptr)[i] = data[i]; - } - } - template - FLASHINFER_INLINE void cast_from(const vec_t &src) { - cast_from_impl(src, *this); - } - template - FLASHINFER_INLINE void cast_load(const T *ptr) { - cast_load_impl(ptr, *this); - } - template - FLASHINFER_INLINE void cast_store(T *ptr) const { - cast_store_impl(*this, ptr); - } - - FLASHINFER_INLINE static void memcpy(half *dst, const half *src) { -#pragma unroll - for (size_t i = 0; i < vec_size / 8; ++i) { - ((uint4 *)dst)[i] = ((uint4 *)src)[i]; - } - } -}; - -/******************* vec_t *******************/ - -// nv_bfloat16 x 1 -template <> -struct vec_t { - nv_bfloat16 data; - - FLASHINFER_INLINE nv_bfloat16 &operator[](size_t i) { - return ((nv_bfloat16 *)(&data))[i]; - } - FLASHINFER_INLINE const nv_bfloat16 &operator[](size_t i) const { - return ((const nv_bfloat16 *)(&data))[i]; - } - FLASHINFER_INLINE void fill(nv_bfloat16 val); - FLASHINFER_INLINE void load(const nv_bfloat16 *ptr); - FLASHINFER_INLINE void store(nv_bfloat16 *ptr) const; - template - FLASHINFER_INLINE void cast_from(const vec_t &src) { - cast_from_impl(src, *this); - } - template - FLASHINFER_INLINE void cast_load(const T *ptr) { - cast_load_impl(ptr, *this); - } - template - FLASHINFER_INLINE void cast_store(T *ptr) const { - cast_store_impl(*this, ptr); - } - - FLASHINFER_INLINE static void memcpy(nv_bfloat16 *dst, - const nv_bfloat16 *src); -}; - -FLASHINFER_INLINE void vec_t::fill(nv_bfloat16 val) { - data = val; -} - -FLASHINFER_INLINE void vec_t::load(const nv_bfloat16 *ptr) { - data = *ptr; -} - -FLASHINFER_INLINE void vec_t::store(nv_bfloat16 *ptr) const { - *ptr = data; -} - -FLASHINFER_INLINE void vec_t::memcpy(nv_bfloat16 *dst, - const nv_bfloat16 *src) { - *dst = *src; -} - -// nv_bfloat16 x 2 -template <> -struct vec_t { - nv_bfloat162 
data; - - FLASHINFER_INLINE nv_bfloat16 &operator[](size_t i) { - return ((nv_bfloat16 *)(&data))[i]; - } - FLASHINFER_INLINE const nv_bfloat16 &operator[](size_t i) const { - return ((const nv_bfloat16 *)(&data))[i]; - } - FLASHINFER_INLINE void fill(nv_bfloat16 val); - FLASHINFER_INLINE void load(const nv_bfloat16 *ptr); - FLASHINFER_INLINE void store(nv_bfloat16 *ptr) const; - template - FLASHINFER_INLINE void cast_from(const vec_t &src) { - cast_from_impl(src, *this); - } - template - FLASHINFER_INLINE void cast_load(const T *ptr) { - cast_load_impl(ptr, *this); - } - template - FLASHINFER_INLINE void cast_store(T *ptr) const { - cast_store_impl(*this, ptr); - } - - FLASHINFER_INLINE static void memcpy(nv_bfloat16 *dst, - const nv_bfloat16 *src); -}; - -FLASHINFER_INLINE void vec_t::fill(nv_bfloat16 val) { - data = make_bfloat162(val, val); -} - -FLASHINFER_INLINE void vec_t::load(const nv_bfloat16 *ptr) { - data = *((nv_bfloat162 *)ptr); -} - -FLASHINFER_INLINE void vec_t::store(nv_bfloat16 *ptr) const { - *((nv_bfloat162 *)ptr) = data; -} - -FLASHINFER_INLINE void vec_t::memcpy(nv_bfloat16 *dst, - const nv_bfloat16 *src) { - *((nv_bfloat162 *)dst) = *((nv_bfloat162 *)src); -} - -// nv_bfloat16 x 4 - -template <> -struct vec_t { - uint2 data; - - FLASHINFER_INLINE nv_bfloat16 &operator[](size_t i) { - return ((nv_bfloat16 *)(&data))[i]; - } - FLASHINFER_INLINE const nv_bfloat16 &operator[](size_t i) const { - return ((const nv_bfloat16 *)(&data))[i]; - } - FLASHINFER_INLINE void fill(nv_bfloat16 val); - FLASHINFER_INLINE void load(const nv_bfloat16 *ptr); - FLASHINFER_INLINE void store(nv_bfloat16 *ptr) const; - template - FLASHINFER_INLINE void cast_from(const vec_t &src) { - cast_from_impl(src, *this); - } - template - FLASHINFER_INLINE void cast_load(const T *ptr) { - cast_load_impl(ptr, *this); - } - template - FLASHINFER_INLINE void cast_store(T *ptr) const { - cast_store_impl(*this, ptr); - } - - FLASHINFER_INLINE static void memcpy(nv_bfloat16 *dst, - const nv_bfloat16 *src); -}; - -FLASHINFER_INLINE void vec_t::fill(nv_bfloat16 val) { - *(nv_bfloat162 *)(&data.x) = make_bfloat162(val, val); - *(nv_bfloat162 *)(&data.y) = make_bfloat162(val, val); -} - -FLASHINFER_INLINE void vec_t::load(const nv_bfloat16 *ptr) { - data = *((uint2 *)ptr); -} - -FLASHINFER_INLINE void vec_t::store(nv_bfloat16 *ptr) const { - *((uint2 *)ptr) = data; -} - -FLASHINFER_INLINE void vec_t::memcpy(nv_bfloat16 *dst, - const nv_bfloat16 *src) { - *((uint2 *)dst) = *((uint2 *)src); -} - -// nv_bfloat16 x 8 or more - -template -struct vec_t { - uint4 data[vec_size / 8]; - - FLASHINFER_INLINE nv_bfloat16 &operator[](size_t i) { - return ((nv_bfloat16 *)data)[i]; - } - FLASHINFER_INLINE const nv_bfloat16 &operator[](size_t i) const { - return ((const nv_bfloat16 *)data)[i]; - } - FLASHINFER_INLINE void fill(nv_bfloat16 val) { -#pragma unoll - for (size_t i = 0; i < vec_size / 8; ++i) { - *(nv_bfloat162 *)(&(data[i].x)) = make_bfloat162(val, val); - *(nv_bfloat162 *)(&(data[i].y)) = make_bfloat162(val, val); - *(nv_bfloat162 *)(&(data[i].z)) = make_bfloat162(val, val); - *(nv_bfloat162 *)(&(data[i].w)) = make_bfloat162(val, val); - } - } - FLASHINFER_INLINE void load(const nv_bfloat16 *ptr) { -#pragma unoll - for (size_t i = 0; i < vec_size / 8; ++i) { - data[i] = ((uint4 *)ptr)[i]; - } - } - FLASHINFER_INLINE void store(nv_bfloat16 *ptr) const { -#pragma unoll - for (size_t i = 0; i < vec_size / 8; ++i) { - ((uint4 *)ptr)[i] = data[i]; - } - } - template - FLASHINFER_INLINE void cast_from(const vec_t &src) { 
- cast_from_impl(src, *this); - } - template - FLASHINFER_INLINE void cast_load(const T *ptr) { - cast_load_impl(ptr, *this); - } - template - FLASHINFER_INLINE void cast_store(T *ptr) const { - cast_store_impl(*this, ptr); - } - - FLASHINFER_INLINE static void memcpy(nv_bfloat16 *dst, - const nv_bfloat16 *src) { -#pragma unoll - for (size_t i = 0; i < vec_size / 8; ++i) { - ((uint4 *)dst)[i] = ((uint4 *)src)[i]; - } - } -}; - -/******************* vec_t *******************/ - -// float x 1 - -template <> -struct vec_t { - float data; - - FLASHINFER_INLINE float &operator[](size_t i) { - return ((float *)(&data))[i]; - } - FLASHINFER_INLINE const float &operator[](size_t i) const { - return ((const float *)(&data))[i]; - } - FLASHINFER_INLINE void fill(float val); - FLASHINFER_INLINE void load(const float *ptr); - FLASHINFER_INLINE void store(float *ptr) const; - template - FLASHINFER_INLINE void cast_from(const vec_t &src) { - cast_from_impl(src, *this); - } - template - FLASHINFER_INLINE void cast_load(const T *ptr) { - cast_load_impl(ptr, *this); - } - template - FLASHINFER_INLINE void cast_store(T *ptr) const { - cast_store_impl(*this, ptr); - } - - FLASHINFER_INLINE static void memcpy(float *dst, const float *src); -}; - -FLASHINFER_INLINE void vec_t::fill(float val) { data = val; } - -FLASHINFER_INLINE void vec_t::load(const float *ptr) { data = *ptr; } - -FLASHINFER_INLINE void vec_t::store(float *ptr) const { *ptr = data; } - -FLASHINFER_INLINE void vec_t::memcpy(float *dst, const float *src) { - *dst = *src; -} - -// float x 2 - -template <> -struct vec_t { - float2 data; - - FLASHINFER_INLINE float &operator[](size_t i) { - return ((float *)(&data))[i]; - } - FLASHINFER_INLINE const float &operator[](size_t i) const { - return ((const float *)(&data))[i]; - } - FLASHINFER_INLINE void fill(float val); - FLASHINFER_INLINE void load(const float *ptr); - FLASHINFER_INLINE void store(float *ptr) const; - template - FLASHINFER_INLINE void cast_from(const vec_t &src) { - cast_from_impl(src, *this); - } - template - FLASHINFER_INLINE void cast_load(const T *ptr) { - cast_load_impl(ptr, *this); - } - template - FLASHINFER_INLINE void cast_store(T *ptr) const { - cast_store_impl(*this, ptr); - } - FLASHINFER_INLINE static void memcpy(float *dst, const float *src); -}; - -FLASHINFER_INLINE void vec_t::fill(float val) { - data = make_float2(val, val); -} - -FLASHINFER_INLINE void vec_t::load(const float *ptr) { - data = *((float2 *)ptr); -} - -FLASHINFER_INLINE void vec_t::store(float *ptr) const { - *((float2 *)ptr) = data; -} - -FLASHINFER_INLINE void vec_t::memcpy(float *dst, const float *src) { - *((float2 *)dst) = *((float2 *)src); -} - -// float x 4 or more -template -struct vec_t { - float4 data[vec_size / 4]; - - FLASHINFER_INLINE float &operator[](size_t i) { return ((float *)(data))[i]; } - FLASHINFER_INLINE const float &operator[](size_t i) const { - return ((const float *)(data))[i]; - } - FLASHINFER_INLINE void fill(float val) { -#pragma unroll - for (size_t i = 0; i < vec_size / 4; ++i) { - data[i] = make_float4(val, val, val, val); - } - } - FLASHINFER_INLINE void load(const float *ptr) { -#pragma unroll - for (size_t i = 0; i < vec_size / 4; ++i) { - data[i] = ((float4 *)ptr)[i]; - } - } - FLASHINFER_INLINE void store(float *ptr) const { -#pragma unroll - for (size_t i = 0; i < vec_size / 4; ++i) { - ((float4 *)ptr)[i] = data[i]; - } - } - template - FLASHINFER_INLINE void cast_from(const vec_t &src) { - cast_from_impl(src, *this); - } - template - FLASHINFER_INLINE void 
cast_load(const T *ptr) { - cast_load_impl(ptr, *this); - } - template - FLASHINFER_INLINE void cast_store(T *ptr) const { - cast_store_impl(*this, ptr); - } - FLASHINFER_INLINE static void memcpy(float *dst, const float *src) { -#pragma unroll - for (size_t i = 0; i < vec_size / 4; ++i) { - ((float4 *)dst)[i] = ((float4 *)src)[i]; - } - } -}; - -/******************* vec_t type cast *******************/ - -template -FLASHINFER_INLINE void cast_from_impl(const vec_t &src, - vec_t &dst) { - if constexpr (vec_size == 1) { - dst.data = float(src.data); - } else { -#pragma unroll - for (size_t i = 0; i < vec_size / 2; ++i) { - ((float2 *)(&dst.data))[i] = __half22float2(((half2 *)(&src.data))[i]); - } - } -} - -template -FLASHINFER_INLINE void cast_from_impl(const vec_t &src, - vec_t &dst) { - if constexpr (vec_size == 1) { - dst.data = half(src.data); - } else { -#pragma unroll - for (size_t i = 0; i < vec_size / 2; ++i) { - ((half2 *)(&dst.data))[i] = __float22half2_rn(((float2 *)(&src.data))[i]); - } - } -} - -template -FLASHINFER_INLINE void cast_from_impl(const vec_t &src, - vec_t &dst) { - if constexpr (vec_size == 1) { - dst.data = float(src.data); - } else { -#pragma unroll - for (size_t i = 0; i < vec_size / 2; ++i) { - ((float2 *)(&dst.data))[i] = - __bfloat1622float2(((nv_bfloat162 *)(&src.data))[i]); - } - } -} - -template -FLASHINFER_INLINE void cast_from_impl(const vec_t &src, - vec_t &dst) { - if constexpr (vec_size == 1) { - dst.data = nv_bfloat16(src.data); - } else { -#pragma unroll - for (size_t i = 0; i < vec_size / 2; ++i) { - ((nv_bfloat162 *)(&dst.data))[i] = - __float22bfloat162_rn(((float2 *)(&src.data))[i]); - } - } -} - -#ifdef FLASHINFER_USE_FP8 - -template -FLASHINFER_INLINE void cast_from_impl(const vec_t<__nv_fp8_e4m3, vec_size> &src, - vec_t &dst) { - if constexpr (vec_size == 1) { - dst.data = float(src.data); - } else if constexpr (vec_size == 2) { - *(float2 *)(&dst.data) = float2(*(__nv_fp8x2_e4m3 *)(&src.data)); - } else { -#pragma unroll - for (size_t i = 0; i < vec_size / 4; ++i) { - ((float4 *)(&dst.data))[i] = float4(((__nv_fp8x4_e4m3 *)(&src.data))[i]); - } - } -} - -template -FLASHINFER_INLINE void cast_from_impl(const vec_t<__nv_fp8_e4m3, vec_size> &src, - vec_t &dst) { - if constexpr (vec_size == 1) { - dst.data = float(src.data); - } else { -#pragma unroll - for (size_t i = 0; i < vec_size / 2; ++i) { - ((half2 *)(&dst.data))[i] = half2(((__nv_fp8x2_e4m3 *)(&src.data))[i]); - } - } -} - -template -FLASHINFER_INLINE void cast_from_impl(const vec_t &src, - vec_t<__nv_fp8_e4m3, vec_size> &dst) { - if constexpr (vec_size == 1) { - dst.data = __nv_fp8_e4m3(src.data); - } else if constexpr (vec_size == 2) { - *(__nv_fp8x2_e4m3 *)(&dst.data) = __nv_fp8x2_e4m3(*(float2 *)(&src.data)); - } else { -#pragma unroll - for (size_t i = 0; i < vec_size / 4; ++i) { - ((__nv_fp8x4_e4m3 *)(&dst.data))[i] = - __nv_fp8x4_e4m3(((float4 *)(&src.data))[i]); - } - } -} - -template -FLASHINFER_INLINE void cast_from_impl(const vec_t &src, - vec_t<__nv_fp8_e4m3, vec_size> &dst) { - if constexpr (vec_size == 1) { - dst.data = __nv_fp8_e4m3(src.data); - } else if constexpr (vec_size == 2) { - *(__nv_fp8x2_e4m3 *)(&dst.data) = __nv_fp8x2_e4m3(*(half2 *)(&src.data)); - } else { -#pragma unroll - for (size_t i = 0; i < vec_size / 4; ++i) { - // NOTE(Zihao): need to double check if we properly handle flo and fhi - ((__nv_fp8x4_e4m3 *)(&dst.data))[i] = __nv_fp8x4_e4m3( - ((half2 *)(&src.data))[i * 2], ((half2 *)(&src.data))[i * 2 + 1]); - } - } -} - -template -FLASHINFER_INLINE void 
cast_from_impl(const vec_t<__nv_fp8_e5m2, vec_size> &src, - vec_t &dst) { - if constexpr (vec_size == 1) { - dst.data = float(src.data); - } else if constexpr (vec_size == 2) { - *(float2 *)(&dst.data) = float2(*(__nv_fp8x2_e5m2 *)(&src.data)); - } else { -#pragma unroll - for (size_t i = 0; i < vec_size / 4; ++i) { - ((float4 *)(&dst.data))[i] = float4(((__nv_fp8x4_e5m2 *)(&src.data))[i]); - } - } -} - -template -FLASHINFER_INLINE void cast_from_impl(const vec_t<__nv_fp8_e5m2, vec_size> &src, - vec_t &dst) { - if constexpr (vec_size == 1) { - dst.data = float(src.data); - } else { -#pragma unroll - for (size_t i = 0; i < vec_size / 2; ++i) { - ((half2 *)(&dst.data))[i] = half2(((__nv_fp8x2_e5m2 *)(&src.data))[i]); - } - } -} - -template -FLASHINFER_INLINE void cast_from_impl(const vec_t &src, - vec_t<__nv_fp8_e5m2, vec_size> &dst) { - if constexpr (vec_size == 1) { - dst.data = __nv_fp8_e5m2(src.data); - } else if constexpr (vec_size == 2) { - *(__nv_fp8x2_e5m2 *)(&dst.data) = __nv_fp8x2_e5m2(*(float2 *)(&src.data)); - } else { -#pragma unroll - for (size_t i = 0; i < vec_size / 4; ++i) { - ((__nv_fp8x4_e5m2 *)(&dst.data))[i] = - __nv_fp8x4_e5m2(((float4 *)(&src.data))[i]); - } - } -} - -template -FLASHINFER_INLINE void cast_from_impl(const vec_t &src, - vec_t<__nv_fp8_e5m2, vec_size> &dst) { - if constexpr (vec_size == 1) { - dst.data = __nv_fp8_e4m3(src.data); - } else if constexpr (vec_size == 2) { - *(__nv_fp8x2_e5m2 *)(&dst.data) = __nv_fp8x2_e5m2(*(half2 *)(&src.data)); - } else { -#pragma unroll - for (size_t i = 0; i < vec_size / 4; ++i) { - // NOTE(Zihao): need to double check if we properly handle flo and fhi - ((__nv_fp8x4_e5m2 *)(&dst.data))[i] = __nv_fp8x4_e5m2( - ((half2 *)(&src.data))[i * 2], ((half2 *)(&src.data))[i * 2 + 1]); - } - } -} - -#endif // FLASHINFER_USE_FP8 - -#endif // VEC_DTYPES_CUH_ diff --git a/csrc/punica/punica_ops.cu b/csrc/punica/punica_ops.cu deleted file mode 100644 index dd29820144b3..000000000000 --- a/csrc/punica/punica_ops.cu +++ /dev/null @@ -1,569 +0,0 @@ -#include -#include -#include - -#include "type_convert.h" -#include "../cuda_compat.h" -#include "bgmv/bgmv_config.h" - - -//====== utils ====== - -inline void check_shape(const torch::Tensor &a, const torch::Tensor &b, - const char *a_name, const char *b_name) { - TORCH_CHECK(a.dim() == b.dim(), a_name, ".dim() != ", b_name, ".dim(). ", - a.dim(), " vs ", b.dim()); - for (int i = 0; i < a.dim(); ++i) { - TORCH_CHECK(a.size(i) == b.size(i), a_name, ".size(", i, ") != ", b_name, - ".size(", i, ")"); - } -} - -inline constexpr uint64_t pack_u32(uint32_t a, uint32_t b) { - return (uint64_t(a) << 32) | uint64_t(b); -} - -#define CHECK_CUDA(x) TORCH_CHECK(x.is_cuda(), #x " must be a CUDA tensor") - -#define CHECK_CONTIGUOUS(x) \ - TORCH_CHECK(x.is_contiguous(), #x " must be contiguous") - -#define CHECK_INPUT(x) \ - CHECK_CUDA(x); \ - CHECK_CONTIGUOUS(x) - -#define CHECK_DIM(d, x) \ - TORCH_CHECK(x.dim() == d, #x " must be a " #d "D tensor") - -#define CHECK_SHAPE(a, b) check_shape(a, b, #a, #b) - -#define CHECK_EQ(a, b) \ - TORCH_CHECK(a == b, "CHECK_EQ(" #a ", " #b ") failed. 
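pack_u32 above folds the (in_features, out_features) pair into a single 64-bit key, and launch_bgmv_kernel below switches on that key to pick a pre-instantiated kernel. A minimal Python sketch of the keying scheme (the registry dict and the concrete feature sizes are ours, standing in for the generated CASE/CASE_ONESIDE switch arms):

    def pack_u32(a: int, b: int) -> int:
        # Same as the removed C++ helper: (uint64(a) << 32) | uint64(b).
        assert 0 <= a < 2**32 and 0 <= b < 2**32
        return (a << 32) | b

    # Hypothetical registry of instantiated kernels keyed by packed shape.
    kernel_registry = {
        pack_u32(4096, 16): "bgmv_shrink_4096_to_16",
        pack_u32(16, 4096): "bgmv_expand_16_to_4096",
    }

    def dispatch(in_features: int, out_features: int) -> str:
        key = pack_u32(in_features, out_features)
        if key not in kernel_registry:
            # Mirrors the default branch that makes launch_bgmv_kernel return
            # false and trip the "No suitable kernel." check.
            raise ValueError(
                f"No suitable kernel: h_in={in_features} h_out={out_features}")
        return kernel_registry[key]

    assert dispatch(4096, 16) == "bgmv_shrink_4096_to_16"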
", a, " vs ", b) - -//====== bgmv ====== - -template -inline bool launch_bgmv_kernel(out_T *Y, const in_T *X, const W_T *W, - const int64_t *lora_indices, - uint32_t in_features, uint32_t out_features, - int64_t y_offset, int64_t full_y_size, - int64_t batch_size, int64_t num_layers, - int64_t layer_idx, float scale) { - // NOTE(woosuk): While Punica supports various combinations of input/output - // data types, we limit the supported data types to reduce the binary size. - constexpr bool is_input_float = std::is_same::value; - constexpr bool is_output_float = std::is_same::value; - if (is_input_float) { - if (!std::is_same::value) { - return false; - } - } else if (is_output_float) { - if (!std::is_same::value) { - return false; - } - } else if (!(std::is_same::value && - std::is_same::value)) { - return false; - } - - switch (pack_u32(in_features, out_features)) { -#define CASE_ONESIDE(_in_T, _out_T, _W_T, feat_in, feat_out) \ - case pack_u32(feat_in, feat_out): \ - bgmv_kernel(Y, X, W, lora_indices, y_offset, \ - full_y_size, batch_size, num_layers, \ - layer_idx, scale); \ - break; -#define CASE(_in_T, _out_T, _W_T, narrow, wide) \ - CASE_ONESIDE(in_T, out_T, W_T, narrow, wide) \ - CASE_ONESIDE(in_T, out_T, W_T, wide, narrow) - - FOR_BGMV_WIDE_NARROW(CASE, _, _, _) - FOR_INST_BGMV_WIDE_NARROW(CASE_ONESIDE, _, _, _) -#undef CASE -#undef CASE_ONESIDE - default: - return false; - } - return true; -} - -void dispatch_bgmv(torch::Tensor y, torch::Tensor x, torch::Tensor w, - torch::Tensor indicies, int64_t layer_idx, double scale) { - CHECK_INPUT(y); - CHECK_INPUT(x); - CHECK_INPUT(w); - CHECK_INPUT(indicies); - - CHECK_DIM(2, y); - CHECK_DIM(2, x); - CHECK_DIM(4, w); - CHECK_DIM(1, indicies); - - int64_t B = x.size(0); - int64_t h_in = x.size(1); - int64_t h_out = y.size(1); - int64_t num_layers = w.size(1); - CHECK_EQ(w.size(3), h_in); - CHECK_EQ(w.size(2), h_out); - CHECK_EQ(indicies.size(0), x.size(0)); - CHECK_EQ(y.size(0), x.size(0)); - const at::cuda::OptionalCUDAGuard device_guard(device_of(x)); - bool ok = false; - if (h_in <= 128512 && h_out <= 128512) { - // TODO: See if we can get rid of this massive nested switch - switch (x.scalar_type()) { - case at::ScalarType::Half: - switch (y.scalar_type()) { - case at::ScalarType::Half: - switch (w.scalar_type()) { - case at::ScalarType::Half: - ok = launch_bgmv_kernel(static_cast(y.data_ptr()), - static_cast(x.data_ptr()), - static_cast(w.data_ptr()), - indicies.data_ptr(), h_in, h_out, 0, - h_out, B, num_layers, layer_idx, scale); - break; - case at::ScalarType::BFloat16: - ok = launch_bgmv_kernel(static_cast(y.data_ptr()), - static_cast(x.data_ptr()), - static_cast(w.data_ptr()), - indicies.data_ptr(), h_in, h_out, 0, - h_out, B, num_layers, layer_idx, scale); - break; - default: - break; - } - break; - case at::ScalarType::BFloat16: - switch (w.scalar_type()) { - case at::ScalarType::Half: - ok = launch_bgmv_kernel(static_cast(y.data_ptr()), - static_cast(x.data_ptr()), - static_cast(w.data_ptr()), - indicies.data_ptr(), h_in, h_out, 0, - h_out, B, num_layers, layer_idx, scale); - break; - case at::ScalarType::BFloat16: - ok = launch_bgmv_kernel(static_cast(y.data_ptr()), - static_cast(x.data_ptr()), - static_cast(w.data_ptr()), - indicies.data_ptr(), h_in, h_out, 0, - h_out, B, num_layers, layer_idx, scale); - break; - default: - break; - } - break; - case at::ScalarType::Float: - switch (w.scalar_type()) { - case at::ScalarType::Half: - ok = launch_bgmv_kernel(static_cast(y.data_ptr()), - static_cast(x.data_ptr()), - 
static_cast(w.data_ptr()), - indicies.data_ptr(), h_in, h_out, 0, - h_out, B, num_layers, layer_idx, scale); - break; - case at::ScalarType::BFloat16: - ok = launch_bgmv_kernel(static_cast(y.data_ptr()), - static_cast(x.data_ptr()), - static_cast(w.data_ptr()), - indicies.data_ptr(), h_in, h_out, 0, - h_out, B, num_layers, layer_idx, scale); - break; - default: - break; - } - break; - default: - break; - } - break; - case at::ScalarType::BFloat16: - switch (y.scalar_type()) { - case at::ScalarType::Half: - switch (w.scalar_type()) { - case at::ScalarType::Half: - ok = launch_bgmv_kernel(static_cast(y.data_ptr()), - static_cast(x.data_ptr()), - static_cast(w.data_ptr()), - indicies.data_ptr(), h_in, h_out, 0, - h_out, B, num_layers, layer_idx, scale); - break; - case at::ScalarType::BFloat16: - ok = launch_bgmv_kernel(static_cast(y.data_ptr()), - static_cast(x.data_ptr()), - static_cast(w.data_ptr()), - indicies.data_ptr(), h_in, h_out, 0, - h_out, B, num_layers, layer_idx, scale); - break; - default: - break; - } - break; - case at::ScalarType::BFloat16: - switch (w.scalar_type()) { - case at::ScalarType::Half: - ok = launch_bgmv_kernel(static_cast(y.data_ptr()), - static_cast(x.data_ptr()), - static_cast(w.data_ptr()), - indicies.data_ptr(), h_in, h_out, 0, - h_out, B, num_layers, layer_idx, scale); - break; - case at::ScalarType::BFloat16: - ok = launch_bgmv_kernel(static_cast(y.data_ptr()), - static_cast(x.data_ptr()), - static_cast(w.data_ptr()), - indicies.data_ptr(), h_in, h_out, 0, - h_out, B, num_layers, layer_idx, scale); - break; - default: - break; - } - break; - case at::ScalarType::Float: - switch (w.scalar_type()) { - case at::ScalarType::Half: - ok = launch_bgmv_kernel(static_cast(y.data_ptr()), - static_cast(x.data_ptr()), - static_cast(w.data_ptr()), - indicies.data_ptr(), h_in, h_out, 0, - h_out, B, num_layers, layer_idx, scale); - break; - case at::ScalarType::BFloat16: - ok = launch_bgmv_kernel(static_cast(y.data_ptr()), - static_cast(x.data_ptr()), - static_cast(w.data_ptr()), - indicies.data_ptr(), h_in, h_out, 0, - h_out, B, num_layers, layer_idx, scale); - break; - default: - break; - } - break; - default: - break; - } - break; - case at::ScalarType::Float: - switch (y.scalar_type()) { - case at::ScalarType::Half: - switch (w.scalar_type()) { - case at::ScalarType::Half: - ok = launch_bgmv_kernel(static_cast(y.data_ptr()), - static_cast(x.data_ptr()), - static_cast(w.data_ptr()), - indicies.data_ptr(), h_in, h_out, 0, - h_out, B, num_layers, layer_idx, scale); - break; - case at::ScalarType::BFloat16: - ok = launch_bgmv_kernel(static_cast(y.data_ptr()), - static_cast(x.data_ptr()), - static_cast(w.data_ptr()), - indicies.data_ptr(), h_in, h_out, 0, - h_out, B, num_layers, layer_idx, scale); - break; - default: - break; - } - break; - case at::ScalarType::BFloat16: - switch (w.scalar_type()) { - case at::ScalarType::Half: - ok = launch_bgmv_kernel(static_cast(y.data_ptr()), - static_cast(x.data_ptr()), - static_cast(w.data_ptr()), - indicies.data_ptr(), h_in, h_out, 0, - h_out, B, num_layers, layer_idx, scale); - break; - case at::ScalarType::BFloat16: - ok = launch_bgmv_kernel(static_cast(y.data_ptr()), - static_cast(x.data_ptr()), - static_cast(w.data_ptr()), - indicies.data_ptr(), h_in, h_out, 0, - h_out, B, num_layers, layer_idx, scale); - break; - default: - break; - } - break; - case at::ScalarType::Float: - switch (w.scalar_type()) { - case at::ScalarType::Half: - ok = launch_bgmv_kernel(static_cast(y.data_ptr()), - static_cast(x.data_ptr()), - 
static_cast(w.data_ptr()), - indicies.data_ptr(), h_in, h_out, 0, - h_out, B, num_layers, layer_idx, scale); - break; - case at::ScalarType::BFloat16: - ok = launch_bgmv_kernel(static_cast(y.data_ptr()), - static_cast(x.data_ptr()), - static_cast(w.data_ptr()), - indicies.data_ptr(), h_in, h_out, 0, - h_out, B, num_layers, layer_idx, scale); - break; - default: - break; - } - break; - default: - break; - } - break; - default: - break; - } - } - TORCH_CHECK(ok, "No suitable kernel.", " h_in=", h_in, " h_out=", h_out, - " dtype=", x.scalar_type(), " out_dtype=", y.scalar_type()); -} - -void dispatch_bgmv_low_level(torch::Tensor y, torch::Tensor x, torch::Tensor w, - torch::Tensor indicies, int64_t layer_idx, - double scale, int64_t h_in, int64_t h_out, - int64_t y_offset) { - CHECK_INPUT(y); - CHECK_INPUT(x); - CHECK_INPUT(w); - CHECK_INPUT(indicies); - - CHECK_DIM(2, y); - CHECK_DIM(2, x); - CHECK_DIM(4, w); - CHECK_DIM(1, indicies); - - int64_t B = x.size(0); - int64_t num_layers = w.size(1); - int64_t full_y_size = y.size(1); - CHECK_EQ(w.size(3), h_in); - CHECK_EQ(w.size(2), h_out); - CHECK_EQ(indicies.size(0), x.size(0)); - CHECK_EQ(y.size(0), x.size(0)); - const at::cuda::OptionalCUDAGuard device_guard(device_of(x)); - bool ok = false; - if (h_in <= 128512 && h_out <= 128512) { - // TODO: See if we can get rid of this massive nested switch - switch (x.scalar_type()) { - case at::ScalarType::Half: - switch (y.scalar_type()) { - case at::ScalarType::Half: - switch (w.scalar_type()) { - case at::ScalarType::Half: - ok = launch_bgmv_kernel(static_cast(y.data_ptr()), - static_cast(x.data_ptr()), - static_cast(w.data_ptr()), - indicies.data_ptr(), h_in, h_out, - y_offset, full_y_size, B, num_layers, - layer_idx, scale); - break; - case at::ScalarType::BFloat16: - ok = launch_bgmv_kernel(static_cast(y.data_ptr()), - static_cast(x.data_ptr()), - static_cast(w.data_ptr()), - indicies.data_ptr(), h_in, h_out, - y_offset, full_y_size, B, num_layers, - layer_idx, scale); - break; - default: - break; - } - break; - case at::ScalarType::BFloat16: - switch (w.scalar_type()) { - case at::ScalarType::Half: - ok = launch_bgmv_kernel(static_cast(y.data_ptr()), - static_cast(x.data_ptr()), - static_cast(w.data_ptr()), - indicies.data_ptr(), h_in, h_out, - y_offset, full_y_size, B, num_layers, - layer_idx, scale); - break; - case at::ScalarType::BFloat16: - ok = launch_bgmv_kernel(static_cast(y.data_ptr()), - static_cast(x.data_ptr()), - static_cast(w.data_ptr()), - indicies.data_ptr(), h_in, h_out, - y_offset, full_y_size, B, num_layers, - layer_idx, scale); - break; - default: - break; - } - break; - case at::ScalarType::Float: - switch (w.scalar_type()) { - case at::ScalarType::Half: - ok = launch_bgmv_kernel(static_cast(y.data_ptr()), - static_cast(x.data_ptr()), - static_cast(w.data_ptr()), - indicies.data_ptr(), h_in, h_out, - y_offset, full_y_size, B, num_layers, - layer_idx, scale); - break; - case at::ScalarType::BFloat16: - ok = launch_bgmv_kernel(static_cast(y.data_ptr()), - static_cast(x.data_ptr()), - static_cast(w.data_ptr()), - indicies.data_ptr(), h_in, h_out, - y_offset, full_y_size, B, num_layers, - layer_idx, scale); - break; - default: - break; - } - break; - default: - break; - } - break; - case at::ScalarType::BFloat16: - switch (y.scalar_type()) { - case at::ScalarType::Half: - switch (w.scalar_type()) { - case at::ScalarType::Half: - ok = launch_bgmv_kernel(static_cast(y.data_ptr()), - static_cast(x.data_ptr()), - static_cast(w.data_ptr()), - indicies.data_ptr(), h_in, h_out, - 
y_offset, full_y_size, B, num_layers, - layer_idx, scale); - break; - case at::ScalarType::BFloat16: - ok = launch_bgmv_kernel(static_cast(y.data_ptr()), - static_cast(x.data_ptr()), - static_cast(w.data_ptr()), - indicies.data_ptr(), h_in, h_out, - y_offset, full_y_size, B, num_layers, - layer_idx, scale); - break; - default: - break; - } - break; - case at::ScalarType::BFloat16: - switch (w.scalar_type()) { - case at::ScalarType::Half: - ok = launch_bgmv_kernel(static_cast(y.data_ptr()), - static_cast(x.data_ptr()), - static_cast(w.data_ptr()), - indicies.data_ptr(), h_in, h_out, - y_offset, full_y_size, B, num_layers, - layer_idx, scale); - break; - case at::ScalarType::BFloat16: - ok = launch_bgmv_kernel(static_cast(y.data_ptr()), - static_cast(x.data_ptr()), - static_cast(w.data_ptr()), - indicies.data_ptr(), h_in, h_out, - y_offset, full_y_size, B, num_layers, - layer_idx, scale); - break; - default: - break; - } - break; - case at::ScalarType::Float: - switch (w.scalar_type()) { - case at::ScalarType::Half: - ok = launch_bgmv_kernel(static_cast(y.data_ptr()), - static_cast(x.data_ptr()), - static_cast(w.data_ptr()), - indicies.data_ptr(), h_in, h_out, - y_offset, full_y_size, B, num_layers, - layer_idx, scale); - break; - case at::ScalarType::BFloat16: - ok = launch_bgmv_kernel(static_cast(y.data_ptr()), - static_cast(x.data_ptr()), - static_cast(w.data_ptr()), - indicies.data_ptr(), h_in, h_out, - y_offset, full_y_size, B, num_layers, - layer_idx, scale); - break; - default: - break; - } - break; - default: - break; - } - break; - case at::ScalarType::Float: - switch (y.scalar_type()) { - case at::ScalarType::Half: - switch (w.scalar_type()) { - case at::ScalarType::Half: - ok = launch_bgmv_kernel(static_cast(y.data_ptr()), - static_cast(x.data_ptr()), - static_cast(w.data_ptr()), - indicies.data_ptr(), h_in, h_out, - y_offset, full_y_size, B, num_layers, - layer_idx, scale); - break; - case at::ScalarType::BFloat16: - ok = launch_bgmv_kernel(static_cast(y.data_ptr()), - static_cast(x.data_ptr()), - static_cast(w.data_ptr()), - indicies.data_ptr(), h_in, h_out, - y_offset, full_y_size, B, num_layers, - layer_idx, scale); - break; - default: - break; - } - break; - case at::ScalarType::BFloat16: - switch (w.scalar_type()) { - case at::ScalarType::Half: - ok = launch_bgmv_kernel(static_cast(y.data_ptr()), - static_cast(x.data_ptr()), - static_cast(w.data_ptr()), - indicies.data_ptr(), h_in, h_out, - y_offset, full_y_size, B, num_layers, - layer_idx, scale); - break; - case at::ScalarType::BFloat16: - ok = launch_bgmv_kernel(static_cast(y.data_ptr()), - static_cast(x.data_ptr()), - static_cast(w.data_ptr()), - indicies.data_ptr(), h_in, h_out, - y_offset, full_y_size, B, num_layers, - layer_idx, scale); - break; - default: - break; - } - break; - case at::ScalarType::Float: - switch (w.scalar_type()) { - case at::ScalarType::Half: - ok = launch_bgmv_kernel(static_cast(y.data_ptr()), - static_cast(x.data_ptr()), - static_cast(w.data_ptr()), - indicies.data_ptr(), h_in, h_out, - y_offset, full_y_size, B, num_layers, - layer_idx, scale); - break; - case at::ScalarType::BFloat16: - ok = launch_bgmv_kernel(static_cast(y.data_ptr()), - static_cast(x.data_ptr()), - static_cast(w.data_ptr()), - indicies.data_ptr(), h_in, h_out, - y_offset, full_y_size, B, num_layers, - layer_idx, scale); - break; - default: - break; - } - break; - default: - break; - } - break; - default: - break; - } - } - TORCH_CHECK(ok, "No suitable kernel.", " h_in=", h_in, " h_out=", h_out, - " dtype=", x.scalar_type(), 
" out_dtype=", y.scalar_type()); -} diff --git a/csrc/punica/punica_ops.h b/csrc/punica/punica_ops.h deleted file mode 100644 index 5d625d0564f7..000000000000 --- a/csrc/punica/punica_ops.h +++ /dev/null @@ -1,11 +0,0 @@ -#pragma once - -#include - -void dispatch_bgmv(torch::Tensor y, torch::Tensor x, torch::Tensor w, - torch::Tensor indicies, int64_t layer_idx, double scale); - -void dispatch_bgmv_low_level(torch::Tensor y, torch::Tensor x, torch::Tensor w, - torch::Tensor indicies, int64_t layer_idx, - double scale, int64_t h_in, int64_t h_out, - int64_t y_offset); diff --git a/csrc/punica/torch_bindings.cpp b/csrc/punica/torch_bindings.cpp deleted file mode 100644 index 894e229b6d9d..000000000000 --- a/csrc/punica/torch_bindings.cpp +++ /dev/null @@ -1,18 +0,0 @@ -#include "registration.h" -#include "punica_ops.h" - -TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) { - m.def( - "dispatch_bgmv(Tensor! y, Tensor x, Tensor w, Tensor indicies, int " - "layer_idx, float scale) -> ()"); - m.impl("dispatch_bgmv", torch::kCUDA, &dispatch_bgmv); - - m.def( - "dispatch_bgmv_low_level(Tensor! y, Tensor x, Tensor w," - "Tensor indicies, int layer_idx," - "float scale, int h_in, int h_out," - "int y_offset) -> ()"); - m.impl("dispatch_bgmv_low_level", torch::kCUDA, &dispatch_bgmv_low_level); -} - -REGISTER_EXTENSION(TORCH_EXTENSION_NAME) diff --git a/csrc/punica/type_convert.h b/csrc/punica/type_convert.h deleted file mode 100644 index dff7ce49283d..000000000000 --- a/csrc/punica/type_convert.h +++ /dev/null @@ -1,82 +0,0 @@ -#ifndef CSRC__PUNICA__TYPE_CONVERT_H__ -#define CSRC__PUNICA__TYPE_CONVERT_H__ - -#ifndef USE_ROCM - -#include -#include - -#else - -#include -#include - -#define __TYPE_CONVERT__HOST_DEVICE__ __host__ __device__ - -typedef __half nv_half; -typedef __hip_bfloat16 nv_bfloat16; -typedef __hip_bfloat162 nv_bfloat162; - -__TYPE_CONVERT__HOST_DEVICE__ -inline __hip_bfloat162 make_bfloat162(__hip_bfloat16 val) { - return __hip_bfloat162{val, val}; -} - -__TYPE_CONVERT__HOST_DEVICE__ -inline __hip_bfloat162 make_bfloat162(__hip_bfloat16 vall, __hip_bfloat16 valr) { - return __hip_bfloat162{vall, valr}; -} - -template -__TYPE_CONVERT__HOST_DEVICE__ -inline T_dst convert_type(T_src val) { - return static_cast(val); -} - -template <> -__TYPE_CONVERT__HOST_DEVICE__ -inline float convert_type<__half, float>(__half val) { - return __half2float(val); -} - -template <> -__TYPE_CONVERT__HOST_DEVICE__ -inline __half convert_type(float val) { - return __float2half(val); -} - -template <> -__TYPE_CONVERT__HOST_DEVICE__ -inline float convert_type<__hip_bfloat16, float>(__hip_bfloat16 val) { - return __bfloat162float(val); -} - -template <> -__TYPE_CONVERT__HOST_DEVICE__ -inline __hip_bfloat16 convert_type(float val) { - return __float2bfloat16(val); -} - -template -__TYPE_CONVERT__HOST_DEVICE__ -inline T vllm_add(T a, T b) { - return a + b; -} - -template <> -__TYPE_CONVERT__HOST_DEVICE__ -inline __half vllm_add<__half>(__half a, __half b) { - return __hadd(a, b); -} - -template <> -__TYPE_CONVERT__HOST_DEVICE__ -inline __hip_bfloat16 vllm_add<__hip_bfloat16>(__hip_bfloat16 a, __hip_bfloat16 b) { - return __hadd(a, b); -} - -#undef __TYPE_CONVERT__HOST_DEVICE__ - -#endif // USE_ROCM - -#endif // CSRC__PUNICA__TYPE_CONVERT_H__ diff --git a/docs/source/getting_started/installation.rst b/docs/source/getting_started/installation.rst index a9dfac8ff5af..99cf34622ab9 100644 --- a/docs/source/getting_started/installation.rst +++ b/docs/source/getting_started/installation.rst @@ -67,7 +67,6 @@ You can also 
build and install vLLM from source: $ git clone https://github.com/vllm-project/vllm.git $ cd vllm - $ # export VLLM_INSTALL_PUNICA_KERNELS=1 # optionally build for multi-LoRA capability $ pip install -e . # This may take 5-10 minutes. .. tip:: diff --git a/setup.py b/setup.py index 72ef26f15e40..63c1f466d291 100644 --- a/setup.py +++ b/setup.py @@ -181,9 +181,6 @@ def configure(self, ext: CMakeExtension) -> None: # match. cmake_args += ['-DVLLM_PYTHON_EXECUTABLE={}'.format(sys.executable)] - if _install_punica(): - cmake_args += ['-DVLLM_INSTALL_PUNICA_KERNELS=ON'] - # # Setup parallelism and build tool # @@ -274,10 +271,6 @@ def _build_custom_ops() -> bool: return _is_cuda() or _is_hip() or _is_cpu() -def _install_punica() -> bool: - return envs.VLLM_INSTALL_PUNICA_KERNELS - - def get_hipcc_rocm_version(): # Run the hipcc --version command result = subprocess.run(['hipcc', '--version'], @@ -446,9 +439,6 @@ def _read_requirements(filename: str) -> List[str]: if _build_custom_ops(): ext_modules.append(CMakeExtension(name="vllm._C")) - if _install_punica(): - ext_modules.append(CMakeExtension(name="vllm._punica_C")) - package_data = { "vllm": ["py.typed", "model_executor/layers/fused_moe/configs/*.json"] } diff --git a/vllm/envs.py b/vllm/envs.py index 5b4a2010d12e..9d21d8bcea70 100644 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -40,7 +40,6 @@ MAX_JOBS: Optional[str] = None NVCC_THREADS: Optional[str] = None VLLM_USE_PRECOMPILED: bool = False - VLLM_INSTALL_PUNICA_KERNELS: bool = False VLLM_NO_DEPRECATION_WARNING: bool = False CMAKE_BUILD_TYPE: Optional[str] = None VERBOSE: bool = False @@ -74,10 +73,6 @@ "VLLM_USE_PRECOMPILED": lambda: bool(os.environ.get("VLLM_USE_PRECOMPILED")), - # If set, vllm will install Punica kernels - "VLLM_INSTALL_PUNICA_KERNELS": - lambda: bool(int(os.getenv("VLLM_INSTALL_PUNICA_KERNELS", "0"))), - # CMake build type # If not set, defaults to "Debug" or "RelWithDebInfo" # Available options: "Debug", "Release", "RelWithDebInfo" From d1ef5a083d3ebbae8eb4cde504fdeb7807beaece Mon Sep 17 00:00:00 2001 From: jeejeeli Date: Tue, 16 Jul 2024 10:08:34 +0800 Subject: [PATCH 53/71] add libentry --- vllm/lora/ops/libentry.py | 97 ++++++++++++++++++++++++++++++ vllm/lora/ops/sgmv_expand.py | 3 + vllm/lora/ops/sgmv_expand_slice.py | 3 + vllm/lora/ops/sgmv_shrink.py | 3 + 4 files changed, 106 insertions(+) create mode 100644 vllm/lora/ops/libentry.py diff --git a/vllm/lora/ops/libentry.py b/vllm/lora/ops/libentry.py new file mode 100644 index 000000000000..a0bc5de9cb07 --- /dev/null +++ b/vllm/lora/ops/libentry.py @@ -0,0 +1,97 @@ +# Copied From https://github.com/FlagOpen/FlagGems + +import triton + + + +class LibEntry(triton.KernelInterface): + def __init__( + self, + fn, + ): + self.fn = fn + self.arg_names = fn.arg_names + self.divisibility = 16 + self.config_cache = dict() + self.kernel_cache = dict() + if isinstance(fn, triton.runtime.Autotuner): + self.rt = "Autotuner" + elif isinstance(fn, triton.runtime.Heuristics): + self.rt = "Heuristics" + else: + self.rt = "JitFunction" + + def run(self, *args, **kwargs): + key = [] + for arg in args: + if hasattr(arg, "data_ptr"): + key.append(arg.dtype) + key.append(arg.data_ptr() % self.divisibility == 0) + elif isinstance(arg, int): + key.append(arg) + entry_key = tuple(key) + + config = {} + # Autotuner + if self.rt == "Autotuner": + if entry_key not in self.config_cache: + # tune + kernel = self.fn.run(*args, **kwargs) + config = self.fn.best_config.kwargs + self.config_cache[entry_key] = config + 
self.kernel_cache[entry_key] = kernel + return + else: + # tuned + config = self.config_cache[entry_key] + kernel = self.kernel_cache[entry_key] + # Heuristics + elif self.rt == "Heuristics": + if entry_key not in self.kernel_cache: + # compile + kernel = self.fn.run(*args, **kwargs) + self.kernel_cache[entry_key] = kernel + return + else: + # compiled + for v, heur in self.fn.values.items(): + config[v] = heur( + {**dict(zip(self.arg_names, args)), **kwargs} + ) + kernel = self.kernel_cache[entry_key] + # JitFunction + else: + if entry_key not in self.kernel_cache: + # compile + kernel = self.fn.run(*args, **kwargs) + self.kernel_cache[entry_key] = kernel + return + else: + # compiled + args = tuple([ + arg + for i, arg in enumerate(args) + if not self.fn.params[i].is_constexpr + ]) + kernel = self.kernel_cache[entry_key] + grid = kwargs["grid"] + if callable(grid): + # grid_fn + current = dict(**kwargs, **config) + meta = {**dict(zip(self.arg_names, args)), **current} + grid = grid(meta) + grid = grid + (1, 1) + + kernel[grid[0:3]](*args) + return + + +def libentry(): + """ + Decorator for triton library entries. + """ + + def decorator(fn): + return LibEntry(fn) + + return decorator diff --git a/vllm/lora/ops/sgmv_expand.py b/vllm/lora/ops/sgmv_expand.py index 2873882bc263..27e91f5d1e4e 100644 --- a/vllm/lora/ops/sgmv_expand.py +++ b/vllm/lora/ops/sgmv_expand.py @@ -9,7 +9,10 @@ import triton import triton.language as tl +from .libentry import libentry + +@libentry() @triton.jit def _sgmv_expand_kernel( input_ptr, diff --git a/vllm/lora/ops/sgmv_expand_slice.py b/vllm/lora/ops/sgmv_expand_slice.py index 2078a47d7e8e..2906500e7873 100644 --- a/vllm/lora/ops/sgmv_expand_slice.py +++ b/vllm/lora/ops/sgmv_expand_slice.py @@ -9,7 +9,10 @@ import triton import triton.language as tl +from .libentry import libentry + +@libentry() @triton.jit def _sgmv_expand_slice_kernel( input_ptr, diff --git a/vllm/lora/ops/sgmv_shrink.py b/vllm/lora/ops/sgmv_shrink.py index 094bc62d9da4..c5bc1c08364c 100644 --- a/vllm/lora/ops/sgmv_shrink.py +++ b/vllm/lora/ops/sgmv_shrink.py @@ -9,7 +9,10 @@ import triton import triton.language as tl +from .libentry import libentry + +@libentry() @triton.jit def _sgmv_shrink_kernel( input_ptr, From b19ee954ca2a57312c3fbb85c5af79025537ed52 Mon Sep 17 00:00:00 2001 From: jeejeeli Date: Tue, 16 Jul 2024 10:14:18 +0800 Subject: [PATCH 54/71] format --- vllm/lora/ops/libentry.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/vllm/lora/ops/libentry.py b/vllm/lora/ops/libentry.py index a0bc5de9cb07..09790dc38c3c 100644 --- a/vllm/lora/ops/libentry.py +++ b/vllm/lora/ops/libentry.py @@ -3,8 +3,8 @@ import triton - class LibEntry(triton.KernelInterface): + def __init__( self, fn, @@ -55,9 +55,10 @@ def run(self, *args, **kwargs): else: # compiled for v, heur in self.fn.values.items(): - config[v] = heur( - {**dict(zip(self.arg_names, args)), **kwargs} - ) + config[v] = heur({ + **dict(zip(self.arg_names, args)), + **kwargs + }) kernel = self.kernel_cache[entry_key] # JitFunction else: @@ -69,8 +70,7 @@ def run(self, *args, **kwargs): else: # compiled args = tuple([ - arg - for i, arg in enumerate(args) + arg for i, arg in enumerate(args) if not self.fn.params[i].is_constexpr ]) kernel = self.kernel_cache[entry_key] From 68622d1b1814b074edd3e90497249a681d907f87 Mon Sep 17 00:00:00 2001 From: jeejeeli Date: Tue, 16 Jul 2024 17:28:19 +0800 Subject: [PATCH 55/71] optimize no lora step --- vllm/lora/ops/bgmv_expand.py | 5 ++-- 
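LibEntry above wraps a Triton JITFunction, Autotuner, or Heuristics object and memoizes the compiled kernel per call signature, so repeat launches skip Triton's Python-side dispatch and re-tuning. The cache key is built from each tensor argument's dtype plus whether its pointer is 16-byte aligned, and from the value of each plain int argument. A sketch of just that key construction (the make_entry_key name and the example tensors are ours):

    import torch

    DIVISIBILITY = 16  # alignment bucket used by LibEntry

    def make_entry_key(*args) -> tuple:
        # Mirrors the key loop in LibEntry.run: tensors contribute
        # (dtype, aligned?), plain ints contribute their value, everything
        # else is ignored.
        key = []
        for arg in args:
            if hasattr(arg, "data_ptr"):
                key.append(arg.dtype)
                key.append(arg.data_ptr() % DIVISIBILITY == 0)
            elif isinstance(arg, int):
                key.append(arg)
        return tuple(key)

    x = torch.empty(8, 16, dtype=torch.float16)
    y = torch.empty(8, 16, dtype=torch.float16)
    print(make_entry_key(x, y, 128))
    # e.g. (torch.float16, True, torch.float16, True, 128); a later call with
    # the same dtypes, alignment and int args reuses the cached kernel.

In the ops files the wrapper is applied simply by stacking @libentry() directly above @triton.jit, as the hunks above show.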
vllm/lora/ops/bgmv_expand_slice.py | 8 +++---- vllm/lora/ops/bgmv_shrink.py | 5 ++-- vllm/lora/ops/sgmv_expand.py | 3 ++- vllm/lora/ops/sgmv_expand_slice.py | 5 ++-- vllm/lora/ops/sgmv_shrink.py | 3 ++- vllm/lora/punica.py | 38 ++++++++++++++++++++---------- 7 files changed, 42 insertions(+), 25 deletions(-) diff --git a/vllm/lora/ops/bgmv_expand.py b/vllm/lora/ops/bgmv_expand.py index 576559beeffe..91251fa0510d 100644 --- a/vllm/lora/ops/bgmv_expand.py +++ b/vllm/lora/ops/bgmv_expand.py @@ -37,7 +37,7 @@ def _bgmv_expand_kernel( CAST_TYPE: tl.constexpr, ): """ - GroupGEMV,Additionally, introducing SPLIT_N can improve large hidden_size's + GroupGEMV, additionally, introducing SPLIT_N can improve large hidden_size's performance """ pid_sn = tl.program_id(axis=0) @@ -101,7 +101,8 @@ def bgmv_expand( lora_b_weights (torch.Tensor): lora'a weight output_tensor (torch.Tensor): output tensor lora_indices_tensor (torch.Tensor): (batch_size,). The LoRA index - corresponding to each batch + corresponding to each batch, An index of -1 means no lora should be + applied. batches (int): batch size add_inputs (bool, optional): Defaults to False. adds the final lora results to the output. diff --git a/vllm/lora/ops/bgmv_expand_slice.py b/vllm/lora/ops/bgmv_expand_slice.py index 24f2b93f4bf2..31b2cd545d3d 100644 --- a/vllm/lora/ops/bgmv_expand_slice.py +++ b/vllm/lora/ops/bgmv_expand_slice.py @@ -38,7 +38,7 @@ def _bgmv_expand_slice_kernel( CAST_TYPE: tl.constexpr, ): """ - GroupGEMV,Additionally, introducing SPLIT_N can improve large hidden_size's + GroupGEMV, additionally, introducing SPLIT_N can improve large hidden_size's performance """ pid_sn = tl.program_id(axis=0) @@ -105,7 +105,8 @@ def bgmv_expand_slice( lora_b_weights (torch.Tensor): lora'b weight output_tensor (torch.Tensor): output tensor lora_indices_tensor (torch.Tensor): (batch_size,). The LoRA index - corresponding to each batch + corresponding to each batch, An index of -1 means no lora should be + applied. slice_offst (int): output_tensor's offst slice_size (int): current output_tensor's size batches (int): batch size @@ -136,10 +137,7 @@ def bgmv_expand_slice( # TODO tuning this config N, K = lora_b_weights.shape[-2:] # K= rank,N=hidden_size - # BLOCK_N = 256 BLOCK_K = triton.next_power_of_2(K) - - # SPLIT_N = 64 EVEN_K = K % BLOCK_K == 0 ADD_INPUTS = add_inputs CAST_TYPE = False diff --git a/vllm/lora/ops/bgmv_shrink.py b/vllm/lora/ops/bgmv_shrink.py index 6e3d90e2d235..1d8d23674d02 100644 --- a/vllm/lora/ops/bgmv_shrink.py +++ b/vllm/lora/ops/bgmv_shrink.py @@ -35,7 +35,7 @@ def _bgmv_shrink_kernel( SPLIT_K: tl.constexpr, ): """ - GroupGEMV,Additionally, introducing SPLIT-K can improve large hidden_size's + GroupGEMV, additionally, introducing SPLIT-K can improve large hidden_size's performance """ pid_sk = tl.program_id(axis=0) @@ -93,7 +93,8 @@ def bgmv_shrink( lora_a_weights (torch.Tensor): lora'a weight output_tensor (torch.Tensor): output tensor lora_indices_tensor (torch.Tensor): (batch_size,). The LoRA index - corresponding to each batch + corresponding to each batch. An index of -1 means no lora should be + applied. batches (int): batch size scaling (float): Scaling factor. override_config (Optional[Dict[str, int]], optional): Defaults to None. diff --git a/vllm/lora/ops/sgmv_expand.py b/vllm/lora/ops/sgmv_expand.py index 27e91f5d1e4e..7e82533e4a1f 100644 --- a/vllm/lora/ops/sgmv_expand.py +++ b/vllm/lora/ops/sgmv_expand.py @@ -121,7 +121,8 @@ def sgmv_expand( seq_len_tensor (torch.Tensor): (batch_size,). 
record the sequence length of the sequences in the batch lora_indices_tensor (torch.Tensor): (batch_size,). The LoRA index - corresponding to each batch + corresponding to each batch. An index of -1 means no lora should be + applied. batches (int): batch size max_seq_length (int): The max sequence lengths of the sequences in the batch diff --git a/vllm/lora/ops/sgmv_expand_slice.py b/vllm/lora/ops/sgmv_expand_slice.py index 2906500e7873..261e562683d3 100644 --- a/vllm/lora/ops/sgmv_expand_slice.py +++ b/vllm/lora/ops/sgmv_expand_slice.py @@ -42,7 +42,7 @@ def _sgmv_expand_slice_kernel( Similar to the 'sgmv_expand' operator, but with an added parameter 'slice_offset'. The reason for not reusing the 'sgmv_expand' operator - might be that in the future, we could implement a fusion operator to + might be that in the future, we could implement a fusion operator to achieve the current functionality instead of having to call it multiple times. """ @@ -130,7 +130,8 @@ def sgmv_expand_slice( seq_len_tensor (torch.Tensor): (batch_size,). record the sequence length of the sequences in the batch lora_indices_tensor (torch.Tensor): (batch_size,). The LoRA index - corresponding to each batch + corresponding to each batch. An index of -1 means no lora should be + applied. batches (int): batch size max_seq_length (int): The max sequence lengths of the sequences in the batch diff --git a/vllm/lora/ops/sgmv_shrink.py b/vllm/lora/ops/sgmv_shrink.py index c5bc1c08364c..670117cee000 100644 --- a/vllm/lora/ops/sgmv_shrink.py +++ b/vllm/lora/ops/sgmv_shrink.py @@ -125,7 +125,8 @@ def sgmv_shrink( seq_len_tensor (torch.Tensor): (batch_size,). record the sequence length of the sequences in the batch lora_indices_tensor (torch.Tensor): (batch_size,). The LoRA index - corresponding to each batch + corresponding to each batch. An index of -1 means no lora should be + applied. batches (int): batch size max_seq_length (int): The max sequence lengths of the sequences in the batch diff --git a/vllm/lora/punica.py b/vllm/lora/punica.py index 16d41cfa11ff..19b523002384 100644 --- a/vllm/lora/punica.py +++ b/vllm/lora/punica.py @@ -24,7 +24,7 @@ def compute_meta( token_lora_tensor: torch.Tensor -) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, int, int]: +) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, int, int, bool]: """ Get the information required for the sgmv kernel. With the features: 1. If consecutive requests in the batch use the same LoRA, this function @@ -40,14 +40,16 @@ def compute_meta( b_seq_start_tensor = torch.zeros_like(seq_length_tensor) b_seq_start_tensor[1:].copy_(cum_result[:-1]) max_length = seq_length_tensor.max().item() + batch_size = lora_indices_tensor.size(0) - return ( - b_seq_start_tensor, - seq_length_tensor, - lora_indices_tensor, - batch_size, - max_length, - ) + no_lora = False + # -1 means no lora should be applied. Use `no_lora` to determine whether + # the current step requires LoRA. If LoRA is not needed, the prefill stage + # does not need to launch the triton kernel, which can improve performance + if batch_size == 1 and lora_indices_tensor == -1: + no_lora = True + return (b_seq_start_tensor, seq_length_tensor, lora_indices_tensor, + batch_size, max_length, no_lora) # TODO see if this can be vectorized @@ -174,7 +176,7 @@ class PunicaWrapper: """ PunicaWrapper is designed to manage and provide metadata for the punica kernel. The main function is to maintain the state information for - Multi-LoRA, and to provide the interface for the punica operator. 
+ Multi-LoRA, and to provide the interface for the punica kernel. """ def __init__(self, max_num_batched_tokens: int, max_batches: int, @@ -213,6 +215,7 @@ def __init__(self, max_num_batched_tokens: int, max_batches: int, self.max_length: int = 0 self.batch_size: int = -1 self.is_prefill = False + self.no_lora = False def update_metadata( self, @@ -276,7 +279,7 @@ def _update_base_metadata( def _update_prefill_metada(self, token_lora_tensor: torch.Tensor) -> None: (b_seq_start_tensor, seq_length_tensor, lora_indices_tensor, - batch_size, max_length) = compute_meta(token_lora_tensor) + batch_size, max_length, no_lora) = compute_meta(token_lora_tensor) self._seq_start_locs[:b_seq_start_tensor.shape[0]].copy_( b_seq_start_tensor) @@ -285,6 +288,7 @@ def _update_prefill_metada(self, token_lora_tensor: torch.Tensor) -> None: lora_indices_tensor) self.batch_size = batch_size self.max_length = max_length + self.no_lora = no_lora @property def prefill_metadata( @@ -294,7 +298,8 @@ def prefill_metadata( metadata for prefill-related kernel computations. 1. seq_start_locs: Tensor of sequence start positions 2. seq_lengths: Tensor of sequence lengths - 3. lora_indices_per_batch: Tensor of lora indices + 3. lora_indices_per_batch: Tensor of lora indices, and an index of + -1 means no lora should be applied. 4. batch_size: batch size after clustering identical lora indices 5. max_length: The maximum sequence length in the batch """ @@ -307,7 +312,7 @@ def prefill_metadata( def token_lora_indices(self) -> torch.Tensor: """ This property provides the lora indices corresponding to each token - in the batch + in the batch. An index of -1 means no lora should be applied. """ token_lora_len = self.indices_len[0] return self._token_lora_indices[:token_lora_len] @@ -354,6 +359,9 @@ def shrink_prefill( w_t_all: torch.Tensor, scale: float, ): + #No LoRA request, so return directly + if self.no_lora: + return sgmv_shrink( x, w_t_all, @@ -378,6 +386,9 @@ def expand_prefill( w_t_all: torch.Tensor, add_input: bool, ): + #No LoRA request, so return directly + if self.no_lora: + return sgmv_expand( x, w_t_all, @@ -404,6 +415,9 @@ def expand_slice_prefill( y_slice_size: Optional[int], add_input: bool, ): + #No LoRA request, so return directly + if self.no_lora: + return sgmv_expand_slice( x, w_t_all, From e7b4a4e3a60cb957089209ab83509d1a08de64cc Mon Sep 17 00:00:00 2001 From: jeejeeli Date: Wed, 17 Jul 2024 11:20:03 +0800 Subject: [PATCH 56/71] move libentry location --- vllm/lora/ops/sgmv_expand.py | 2 +- vllm/lora/ops/sgmv_expand_slice.py | 2 +- vllm/lora/ops/sgmv_shrink.py | 2 +- vllm/lora/punica.py | 2 +- vllm/triton_utils/__init__.py | 5 ++--- vllm/{lora/ops => triton_utils}/libentry.py | 0 6 files changed, 6 insertions(+), 7 deletions(-) rename vllm/{lora/ops => triton_utils}/libentry.py (100%) diff --git a/vllm/lora/ops/sgmv_expand.py b/vllm/lora/ops/sgmv_expand.py index 7e82533e4a1f..f4edde95345e 100644 --- a/vllm/lora/ops/sgmv_expand.py +++ b/vllm/lora/ops/sgmv_expand.py @@ -9,7 +9,7 @@ import triton import triton.language as tl -from .libentry import libentry +from vllm.triton_utils import libentry @libentry() diff --git a/vllm/lora/ops/sgmv_expand_slice.py b/vllm/lora/ops/sgmv_expand_slice.py index 261e562683d3..16181f3f7b74 100644 --- a/vllm/lora/ops/sgmv_expand_slice.py +++ b/vllm/lora/ops/sgmv_expand_slice.py @@ -9,7 +9,7 @@ import triton import triton.language as tl -from .libentry import libentry +from vllm.triton_utils import libentry @libentry() diff --git a/vllm/lora/ops/sgmv_shrink.py 
b/vllm/lora/ops/sgmv_shrink.py index 670117cee000..8ab049989abe 100644 --- a/vllm/lora/ops/sgmv_shrink.py +++ b/vllm/lora/ops/sgmv_shrink.py @@ -9,7 +9,7 @@ import triton import triton.language as tl -from .libentry import libentry +from vllm.triton_utils import libentry @libentry() diff --git a/vllm/lora/punica.py b/vllm/lora/punica.py index 19b523002384..ac1392518590 100644 --- a/vllm/lora/punica.py +++ b/vllm/lora/punica.py @@ -17,7 +17,7 @@ from vllm.lora.ops.sgmv_shrink import sgmv_shrink if TYPE_CHECKING: - # avodi circuit import + # avoid circuit import from vllm.lora.layers import LoRAMapping from vllm.lora.models import LongContextLoRAContext diff --git a/vllm/triton_utils/__init__.py b/vllm/triton_utils/__init__.py index 09843e5d1f30..e42f41a6779f 100644 --- a/vllm/triton_utils/__init__.py +++ b/vllm/triton_utils/__init__.py @@ -1,6 +1,5 @@ from vllm.triton_utils.custom_cache_manager import ( maybe_set_triton_cache_manager) +from vllm.triton_utils.libentry import libentry -__all__ = [ - "maybe_set_triton_cache_manager", -] +__all__ = ["maybe_set_triton_cache_manager", "libentry"] diff --git a/vllm/lora/ops/libentry.py b/vllm/triton_utils/libentry.py similarity index 100% rename from vllm/lora/ops/libentry.py rename to vllm/triton_utils/libentry.py From 008a9d7e1d656ccab9a1b31f0b265f3215532343 Mon Sep 17 00:00:00 2001 From: jeejeeli Date: Wed, 17 Jul 2024 15:47:44 +0800 Subject: [PATCH 57/71] test gemma lora --- tests/lora/test_gemma.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/lora/test_gemma.py b/tests/lora/test_gemma.py index 709246179bfe..478bb86b7861 100644 --- a/tests/lora/test_gemma.py +++ b/tests/lora/test_gemma.py @@ -37,7 +37,7 @@ def test_gemma_lora(gemma_lora_files): expected_lora_output = [ "more important than knowledge.\nAuthor: Albert Einstein\n", "everyone else is already taken.\nAuthor: Oscar Wilde\n", - "so little time\nAuthor: Frank Zappa\n", + "so little time.\nAuthor: Frank Zappa\n", ] output1 = do_sample(llm, gemma_lora_files, lora_id=1) From 5e112090ee972bff21e31ae711e366763a5c7f18 Mon Sep 17 00:00:00 2001 From: jeejeeli Date: Thu, 18 Jul 2024 14:11:08 +0800 Subject: [PATCH 58/71] cleanup code --- vllm/_custom_ops.py | 42 +------------------------------ vllm/lora/fully_sharded_layers.py | 2 -- vllm/lora/models.py | 1 + vllm/triton_utils/libentry.py | 9 ++++++- 4 files changed, 10 insertions(+), 44 deletions(-) diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py index 4ca67224a91b..0130d3424c13 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -13,12 +13,9 @@ except ImportError as e: logger.warning("Failed to import from vllm._C with %r", e) -with contextlib.suppress(ImportError): - import vllm._moe_C - with contextlib.suppress(ImportError): # ruff: noqa: F401 - import vllm._punica_C + import vllm._moe_C def is_custom_op_supported(op_name: str) -> bool: @@ -471,43 +468,6 @@ def register_graph_buffers(fa: int, handles: List[str], torch.ops._C_custom_ar.register_graph_buffers(fa, handles, offsets) -# punica -def dispatch_bgmv( - y: torch.Tensor, - x: torch.Tensor, - w_t_all: torch.Tensor, - indicies: torch.Tensor, - layer_idx: int, - scale: float, -) -> None: - torch.ops._punica_C.dispatch_bgmv(y, x, w_t_all, indicies, layer_idx, - scale) - - -def dispatch_bgmv_low_level( - y: torch.Tensor, - x: torch.Tensor, - w_t_all: torch.Tensor, - indicies: torch.Tensor, - layer_idx: int, - scale: float, - h_in: int, - h_out: int, - y_offset: int, -) -> None: - torch.ops._punica_C.dispatch_bgmv_low_level( - y, - x, - w_t_all, - 
indicies, - layer_idx, - scale, - h_in, - h_out, - y_offset, - ) - - # temporary fix for https://github.com/vllm-project/vllm/issues/5456 # TODO: remove this in v0.6.0 names_and_values = globals() diff --git a/vllm/lora/fully_sharded_layers.py b/vllm/lora/fully_sharded_layers.py index cae7d593f123..f751434bb7b4 100644 --- a/vllm/lora/fully_sharded_layers.py +++ b/vllm/lora/fully_sharded_layers.py @@ -15,8 +15,6 @@ QKVParallelLinearWithLora, RowParallelLinearWithLoRA) -# from vllm.lora.punica import add_expand, add_expand_slice, add_shrink - if TYPE_CHECKING: pass diff --git a/vllm/lora/models.py b/vllm/lora/models.py index bbb2fca6e804..9a9b4766cf41 100644 --- a/vllm/lora/models.py +++ b/vllm/lora/models.py @@ -461,6 +461,7 @@ def _create_lora_modules(self): self.model.config)) self.register_module(module_name, new_module) self._register_packed_modules(module_name) + # All lora layers share the same punica_wrapper based on reference. new_module.set_mapping(self.punica_wrapper) def register_module(self, module_name: str, module: "BaseLayerWithLoRA"): diff --git a/vllm/triton_utils/libentry.py b/vllm/triton_utils/libentry.py index 09790dc38c3c..2a981663bebf 100644 --- a/vllm/triton_utils/libentry.py +++ b/vllm/triton_utils/libentry.py @@ -80,7 +80,10 @@ def run(self, *args, **kwargs): current = dict(**kwargs, **config) meta = {**dict(zip(self.arg_names, args)), **current} grid = grid(meta) - grid = grid + (1, 1) + if isinstance(grid, list): + grid = grid + [1, 1] + elif isinstance(grid, list): + grid = grid + (1, 1) kernel[grid[0:3]](*args) return @@ -89,6 +92,10 @@ def run(self, *args, **kwargs): def libentry(): """ Decorator for triton library entries. + Motivation: + The runtime overhead of Triton kernels is the reason for the lower + performance of small kernels, particularly evident with smaller models. + Using this decorator can reduce Triton runtime overhead. 
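A minimal usage sketch of the decorator (illustration only, not part of this series; the toy kernel, tensor names, and block size below are made up). The wrapped kernel is launched exactly like a plain @triton.jit kernel; the first launch goes through JITFunction.run, which compiles, executes, and fills LibEntry's cache, and later launches with the same argument types reuse the cached compiled kernel:

    import torch
    import triton
    import triton.language as tl

    from vllm.triton_utils import libentry


    @libentry()
    @triton.jit
    def _scale_kernel(x_ptr, y_ptr, n, scale, BLOCK: tl.constexpr):
        # Scale a 1-D tensor block by block.
        offs = tl.program_id(0) * BLOCK + tl.arange(0, BLOCK)
        mask = offs < n
        x = tl.load(x_ptr + offs, mask=mask, other=0.0)
        tl.store(y_ptr + offs, x * scale, mask=mask)


    x = torch.randn(4096, device="cuda", dtype=torch.float16)
    y = torch.empty_like(x)
    grid = (triton.cdiv(x.numel(), 256), )
    _scale_kernel[grid](x, y, x.numel(), 2.0, 256)  # compile + run, cache entry created
    _scale_kernel[grid](x, y, x.numel(), 2.0, 256)  # cached fast path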
""" def decorator(fn): From 0c010fdae976fb4dc82fb65fb59d5a4f2cf3135e Mon Sep 17 00:00:00 2001 From: jeejeeli Date: Fri, 19 Jul 2024 19:25:15 +0800 Subject: [PATCH 59/71] Verify libentry decorator for punica and sample kernels --- tests/kernels/test_sampler.py | 55 +++++++++++++-------- tests/lora/test_triton_punica.py | 83 +++++++++++++++++++++++--------- 2 files changed, 96 insertions(+), 42 deletions(-) diff --git a/tests/kernels/test_sampler.py b/tests/kernels/test_sampler.py index e28f809309ec..34104c1818c7 100644 --- a/tests/kernels/test_sampler.py +++ b/tests/kernels/test_sampler.py @@ -1,4 +1,5 @@ import gc +from unittest.mock import patch import pytest import torch @@ -6,10 +7,11 @@ import triton.language as tl from vllm.model_executor.layers.ops.sample import ( - MAX_TRITON_N_COLS, _uniform_to_exponential, get_num_triton_sampler_splits, - sample) + MAX_TRITON_N_COLS, _sample_triton, _uniform_to_exponential, + get_num_triton_sampler_splits, sample) from vllm.model_executor.sampling_metadata import SamplingTensors from vllm.model_executor.utils import set_random_seed +from vllm.triton_utils.libentry import LibEntry SINGLE_SPLIT_VOCAB_SIZE = 32000 # llama/mistral/mixtral vocab size MULTI_SPLIT_VOCAB_SIZE = MAX_TRITON_N_COLS + 100 @@ -75,15 +77,20 @@ def test_sample_decoding_only(random_sampling, max_best_of, seeds = torch.randint(1, torch.iinfo(torch.long).max, (n_splits, bs), device="cuda").mul_(random_sampling_mask) - sampled_tokens, sampled_logprobs, sampled_modified_probs = sample( - probs=probs, - logprobs=logprobs, - sample_indices=sample_indices, - seeds=seeds, - max_best_of=max_best_of, - modify_greedy_probs=modify_greedy_probs, - save_logprobs=save_logprobs, - _save_modified_probs=True) + #The current _sample_triton does not utilize the + # libentry decoration. The purpose of adding this patch is to test + # the correctness of libentry. 
+ with patch("vllm.model_executor.layers.ops.sample._sample_triton", + LibEntry(_sample_triton)): + sampled_tokens, sampled_logprobs, sampled_modified_probs = sample( + probs=probs, + logprobs=logprobs, + sample_indices=sample_indices, + seeds=seeds, + max_best_of=max_best_of, + modify_greedy_probs=modify_greedy_probs, + save_logprobs=save_logprobs, + _save_modified_probs=True) assert sampled_tokens.shape == (bs, max_best_of) for i in range(bs): assert torch.all(sampled_tokens[i] == i * (vocab_size // bs)) @@ -129,6 +136,7 @@ def test_sample_decoding_only(random_sampling, max_best_of, [SINGLE_SPLIT_VOCAB_SIZE, MULTI_SPLIT_VOCAB_SIZE]) def test_sample_prompt_logprobs(random_sampling, max_best_of, modify_greedy_probs, seed, vocab_size): + set_random_seed(seed) prompt_sizes = [16, 32, 64, 128] * 2 samples = 8 @@ -156,14 +164,17 @@ def test_sample_prompt_logprobs(random_sampling, max_best_of, seeds = torch.randint(1, torch.iinfo(torch.long).max, (n_splits, samples), device="cuda").mul_(random_sampling_mask) - sampled_tokens, sampled_logprobs, _ = sample( - probs=probs, - logprobs=logprobs, - sample_indices=sample_indices, - seeds=seeds, - max_best_of=max_best_of, - modify_greedy_probs=modify_greedy_probs, - save_logprobs=True) + #ditto + with patch("vllm.model_executor.layers.ops.sample._sample_triton", + LibEntry(_sample_triton)): + sampled_tokens, sampled_logprobs, _ = sample( + probs=probs, + logprobs=logprobs, + sample_indices=sample_indices, + seeds=seeds, + max_best_of=max_best_of, + modify_greedy_probs=modify_greedy_probs, + save_logprobs=True) assert sampled_tokens.shape == (samples, max_best_of) assert sampled_logprobs.shape == (samples, max_best_of) for i, t in enumerate(sample_indices): @@ -194,3 +205,9 @@ def test_get_sequence_seeds(seed): assert new_seq_seed_extra_entropy != new_seq_seed assert seq_seed != new_seq_seed seq_seed = new_seq_seed + + +if __name__ == "__main__": + pytest.main([ + "/home/sobey/Code/Code_leejee/vllm_main/vllm/tests/kernels/test_sampler.py::test_sample_prompt_logprobs" + ]) diff --git a/tests/lora/test_triton_punica.py b/tests/lora/test_triton_punica.py index 3ed2f032241e..de935818c1c5 100644 --- a/tests/lora/test_triton_punica.py +++ b/tests/lora/test_triton_punica.py @@ -1,4 +1,5 @@ import random +from unittest.mock import patch import pytest import torch @@ -9,6 +10,7 @@ from vllm.lora.ops.sgmv_expand import sgmv_expand from vllm.lora.ops.sgmv_expand_slice import sgmv_expand_slice from vllm.lora.ops.sgmv_shrink import sgmv_shrink +from vllm.triton_utils.libentry import LibEntry HIDDEN_SIZES = [ 128, @@ -323,6 +325,8 @@ def test_punica_bgmv( seed: int, device: str, ): + from vllm.lora.ops.bgmv_expand import _bgmv_expand_kernel + from vllm.lora.ops.bgmv_shrink import _bgmv_shrink_kernel random.seed(seed) torch.set_default_device(device) torch.random.manual_seed(seed) @@ -346,21 +350,29 @@ def test_punica_bgmv( ) = _generate_data(batches, hidden_size, num_loras, rank, seq_length, dtype, op_type, device) if op_type == "shrink": - bgmv_shrink( - inputs_tensor, - lora_weights, - our_out_tensor, - indices, - scaling, - ) + #The current _bgmv_shrink_kernel does not require the libentry + # decoration. The purpose of adding this patch is to test the + # correctness of libentry. 
+ with patch("vllm.lora.ops.bgmv_shrink._bgmv_shrink_kernel", + LibEntry(_bgmv_shrink_kernel)): + bgmv_shrink( + inputs_tensor, + lora_weights, + our_out_tensor, + indices, + scaling, + ) else: - bgmv_expand( - inputs_tensor, - lora_weights, - our_out_tensor, - indices, - add_inputs=True, - ) + #ditto + with patch("vllm.lora.ops.bgmv_expand._bgmv_expand_kernel", + LibEntry(_bgmv_expand_kernel)): + bgmv_expand( + inputs_tensor, + lora_weights, + our_out_tensor, + indices, + add_inputs=True, + ) _torch_groupgemm( ref_out_tensor, inputs_tensor, @@ -394,6 +406,7 @@ def test_punica_expand_nslices( seed: int, device: str, ): + from vllm.lora.ops.bgmv_expand_slice import _bgmv_expand_slice_kernel random.seed(seed) torch.set_default_device(device) torch.random.manual_seed(seed) @@ -446,15 +459,21 @@ def test_punica_expand_nslices( add_inputs=True, ) else: - bgmv_expand_slice( - inputs_tensor, - lora_weights, - our_outputs, - indices, - slice_offset, - slice_size=hidden_size, - add_inputs=True, - ) + #The current _bgmv_expand_slice_kernel does not require the + # libentry decoration. The purpose of adding this patch is to test + # the correctness of libentry. + with patch( + "vllm.lora.ops.bgmv_expand_slice._bgmv_expand_slice_kernel", + LibEntry(_bgmv_expand_slice_kernel)): + bgmv_expand_slice( + inputs_tensor, + lora_weights, + our_outputs, + indices, + slice_offset, + slice_size=hidden_size, + add_inputs=True, + ) _torch_groupgemm( ref_outputs[:, slice_offset:slice_offset + hidden_size], inputs_tensor, @@ -468,3 +487,21 @@ def test_punica_expand_nslices( slice_offset += hidden_size assert_close(our_outputs, ref_outputs) + + +if __name__ == "__main__": + from itertools import product + lst = list( + product( + BATCHES, + NUM_LORA, + MAX_RANKS, + [1.0], + [torch.float16], + ["shrink"], + SEED, + CUDA_DEVICES, + )) + for ele in lst: + test_punica_bgmv(*ele) + print(f"{ele},pass") From 1a23abc214cb9b298e70681d24fc9009b838d5ff Mon Sep 17 00:00:00 2001 From: jeejeeli Date: Mon, 22 Jul 2024 14:06:48 +0800 Subject: [PATCH 60/71] clean up code --- tests/kernels/test_sampler.py | 6 ------ vllm/triton_utils/libentry.py | 3 +-- 2 files changed, 1 insertion(+), 8 deletions(-) diff --git a/tests/kernels/test_sampler.py b/tests/kernels/test_sampler.py index 34104c1818c7..8d6a622a5071 100644 --- a/tests/kernels/test_sampler.py +++ b/tests/kernels/test_sampler.py @@ -205,9 +205,3 @@ def test_get_sequence_seeds(seed): assert new_seq_seed_extra_entropy != new_seq_seed assert seq_seed != new_seq_seed seq_seed = new_seq_seed - - -if __name__ == "__main__": - pytest.main([ - "/home/sobey/Code/Code_leejee/vllm_main/vllm/tests/kernels/test_sampler.py::test_sample_prompt_logprobs" - ]) diff --git a/vllm/triton_utils/libentry.py b/vllm/triton_utils/libentry.py index d2d1ce969237..4bdbc4efd979 100644 --- a/vllm/triton_utils/libentry.py +++ b/vllm/triton_utils/libentry.py @@ -44,7 +44,6 @@ def key(self, spec_args, dns_args, const_args): def run(self, *args, **kwargs): grid = kwargs["grid"] - # collect all the arguments spec_args = [] # specialize arguments dns_args = [] # do not specialize arguments @@ -118,7 +117,7 @@ def run(self, *args, **kwargs): # captured args have higher priority filterd_constexprs = { k: v - for k, v in constexprs.items() if not isinstance(v, type) + for k, v in constexprs.items() if v is not inspect._empty } meta = { **dict(zip(self.arg_names, args)), From c1a0cd50ece137314f5ea5e8968331c29cd856cd Mon Sep 17 00:00:00 2001 From: jeejeeli Date: Mon, 22 Jul 2024 17:05:13 +0800 Subject: [PATCH 61/71] 
modify libentry code --- vllm/triton_utils/libentry.py | 57 +++++++++++++++++++---------------- 1 file changed, 31 insertions(+), 26 deletions(-) diff --git a/vllm/triton_utils/libentry.py b/vllm/triton_utils/libentry.py index 4bdbc4efd979..d247d0a5adad 100644 --- a/vllm/triton_utils/libentry.py +++ b/vllm/triton_utils/libentry.py @@ -78,7 +78,7 @@ def run(self, *args, **kwargs): entry_key = self.key(spec_args, dns_args, const_args) if entry_key not in self.kernel_cache: - # compile kernel + # compiling the kernel also completes the related computations kernel = self.fn.run(*args, **kwargs) fn = self.fn # collect constexpr arguments for grid computation @@ -100,38 +100,43 @@ def run(self, *args, **kwargs): else: raise RuntimeError("Invalid Runtime Function") fn = fn.fn + # In vLLM, certain kernels like fused_moe_kernel get the + # best_config(as kwargs) from a configuration json file, rather + # than using Autotuner & Heuristics. Therefore, all their constexprs + # (tl.constexpr) are assigned values through the following loop. for p in self.jit_function.params: if p.is_constexpr and p.name not in constexprs: - constexprs[p.name] = p.default + constexprs[p.name] = p.default #default=inspect._empty self.kernel_cache[entry_key] = (kernel, constexprs) - return else: kernel, constexprs = self.kernel_cache[entry_key] - if callable(grid): - # collect all arguments to the grid fn,ie: - # 1. args, - # 2. kwargs, - # 3. all all other captured arguments in CompiledKernel from - # Autotunner & Heuristics when kwargs & captured args conflict, - # captured args have higher priority - filterd_constexprs = { - k: v - for k, v in constexprs.items() if v is not inspect._empty - } - meta = { - **dict(zip(self.arg_names, args)), - **kwargs, - **filterd_constexprs, - } + if callable(grid): + # collect all arguments to the grid fn,ie: + # 1. args, + # 2. kwargs, + # 3. all all other captured arguments in CompiledKernel from + # Autotunner & Heuristics when kwargs & captured args conflict, + # captured args have higher priority + # 4. 
We must filter out captured args with default value firstly + constexprs = { + k: v + for k, v in constexprs.items() if v is not inspect._empty + } + + meta = { + **dict(zip(self.arg_names, args)), + **kwargs, + **constexprs, + } grid = grid(meta) - if isinstance(grid, tuple): - grid = grid + (1, 1) - elif isinstance(grid, list): - grid = grid + [1, 1] - - kernel[grid[0:3]](*k_args) - return + if isinstance(grid, tuple): + grid = grid + (1, 1) + elif isinstance(grid, list): + grid = grid + [1, 1] + kernel[grid[0:3]](*k_args) + # maintaining the same return type as the JITFunction.run + return kernel def libentry(): From 4513dcf4b06b55911b3c58fc9c2b439950e1b872 Mon Sep 17 00:00:00 2001 From: jeejeeli Date: Mon, 22 Jul 2024 17:29:15 +0800 Subject: [PATCH 62/71] fix bug --- vllm/triton_utils/libentry.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/vllm/triton_utils/libentry.py b/vllm/triton_utils/libentry.py index d247d0a5adad..85786fae334a 100644 --- a/vllm/triton_utils/libentry.py +++ b/vllm/triton_utils/libentry.py @@ -123,13 +123,12 @@ def run(self, *args, **kwargs): k: v for k, v in constexprs.items() if v is not inspect._empty } - meta = { **dict(zip(self.arg_names, args)), **kwargs, **constexprs, } - grid = grid(meta) + grid = grid(meta) if isinstance(grid, tuple): grid = grid + (1, 1) elif isinstance(grid, list): From c876e39e2cdd1a23209bb8ab188cdc2b02fe52bc Mon Sep 17 00:00:00 2001 From: jeejeeli Date: Mon, 22 Jul 2024 23:31:19 +0800 Subject: [PATCH 63/71] modify libentry code and cleanup code --- tests/lora/test_quant_model.py | 48 +++++++++++++++++------------- vllm/lora/ops/bgmv_expand.py | 1 - vllm/lora/ops/bgmv_shrink.py | 1 - vllm/lora/ops/sgmv_expand.py | 1 - vllm/lora/ops/sgmv_expand_slice.py | 1 - vllm/triton_utils/libentry.py | 12 +++++++- 6 files changed, 38 insertions(+), 26 deletions(-) diff --git a/tests/lora/test_quant_model.py b/tests/lora/test_quant_model.py index 8fd968c69e58..2c78fbae397c 100644 --- a/tests/lora/test_quant_model.py +++ b/tests/lora/test_quant_model.py @@ -64,14 +64,16 @@ def test_quant_model_lora(tinyllama_lora_files, model, tp_size): # if torch.cuda.device_count() < tp_size: # pytest.skip(f"Not enough GPUs for tensor parallelism {tp_size}") - llm = vllm.LLM(model=model.model_path, - enable_lora=True, - max_num_seqs=16, - max_loras=4, - max_model_len=400, - tensor_parallel_size=tp_size, - quantization=model.quantization, - trust_remote_code=True) + llm = vllm.LLM( + model=model.model_path, + enable_lora=True, + max_num_seqs=16, + max_loras=4, + max_model_len=400, + tensor_parallel_size=tp_size, + gpu_memory_utilization=0.4, #avoid OOM + quantization=model.quantization, + trust_remote_code=True) if model.quantization is None: expected_no_lora_output = [ @@ -156,24 +158,28 @@ def test_quant_model_tp_equality(tinyllama_lora_files, model): # if torch.cuda.device_count() < 2: # pytest.skip(f"Not enough GPUs for tensor parallelism {2}") - llm_tp1 = vllm.LLM(model=model.model_path, - enable_lora=True, - max_num_seqs=16, - max_loras=4, - tensor_parallel_size=1, - quantization=model.quantization, - trust_remote_code=True) + llm_tp1 = vllm.LLM( + model=model.model_path, + enable_lora=True, + max_num_seqs=16, + max_loras=4, + tensor_parallel_size=1, + gpu_memory_utilization=0.4, #avoid OOM + quantization=model.quantization, + trust_remote_code=True) output_tp1 = do_sample(llm_tp1, tinyllama_lora_files, lora_id=1) del llm_tp1 cleanup() - llm_tp2 = vllm.LLM(model=model.model_path, - enable_lora=True, - max_num_seqs=16, - max_loras=4, - 
tensor_parallel_size=2, - quantization=model.quantization) + llm_tp2 = vllm.LLM( + model=model.model_path, + enable_lora=True, + max_num_seqs=16, + max_loras=4, + tensor_parallel_size=2, + gpu_memory_utilization=0.4, #avoid OOM + quantization=model.quantization) output_tp2 = do_sample(llm_tp2, tinyllama_lora_files, lora_id=1) del llm_tp2 diff --git a/vllm/lora/ops/bgmv_expand.py b/vllm/lora/ops/bgmv_expand.py index 91251fa0510d..2d09c7cfe6c8 100644 --- a/vllm/lora/ops/bgmv_expand.py +++ b/vllm/lora/ops/bgmv_expand.py @@ -66,7 +66,6 @@ def _bgmv_expand_kernel( c_ptr = out_ptr + cur_batch * cm_stride + pid_sn * split_n_length for n in range(0, split_n_length, BLOCK_N): current_n = n + offset_n - # vector load current_n_c = tl.max_contiguous(current_n, BLOCK_N) b_ptr_mask = (current_n[:, None] < split_n_length) & (offset_k[None, :] < K) diff --git a/vllm/lora/ops/bgmv_shrink.py b/vllm/lora/ops/bgmv_shrink.py index 1d8d23674d02..e69d33078f5a 100644 --- a/vllm/lora/ops/bgmv_shrink.py +++ b/vllm/lora/ops/bgmv_shrink.py @@ -51,7 +51,6 @@ def _bgmv_shrink_kernel( accumulator = tl.zeros((BLOCK_N, ), dtype=tl.float32) for k in range(0, K, BLOCK_K * SPLIT_K): current_k = k + offset_k - # vector load current_k_c = tl.max_contiguous(current_k, BLOCK_K) tiled_a = tl.load( a_ptr + current_k_c, diff --git a/vllm/lora/ops/sgmv_expand.py b/vllm/lora/ops/sgmv_expand.py index f4edde95345e..459049546909 100644 --- a/vllm/lora/ops/sgmv_expand.py +++ b/vllm/lora/ops/sgmv_expand.py @@ -76,7 +76,6 @@ def _sgmv_expand_kernel( other=0) if CAST_TYPE: tiled_a = tiled_a.to(lora_ptr.dtype.element_ty) - #TODO Can I use D=A@B+C ? accumulator += tl.dot( tiled_a, tiled_b, diff --git a/vllm/lora/ops/sgmv_expand_slice.py b/vllm/lora/ops/sgmv_expand_slice.py index 16181f3f7b74..ff3bcda071b8 100644 --- a/vllm/lora/ops/sgmv_expand_slice.py +++ b/vllm/lora/ops/sgmv_expand_slice.py @@ -82,7 +82,6 @@ def _sgmv_expand_slice_kernel( other=0) if CAST_TYPE: tiled_a = tiled_a.to(lora_ptr.dtype.element_ty) - # TODO Can I use D=A@B+C ? accumulator += tl.dot( tiled_a, tiled_b, diff --git a/vllm/triton_utils/libentry.py b/vllm/triton_utils/libentry.py index 85786fae334a..9e3774d98a54 100644 --- a/vllm/triton_utils/libentry.py +++ b/vllm/triton_utils/libentry.py @@ -78,7 +78,7 @@ def run(self, *args, **kwargs): entry_key = self.key(spec_args, dns_args, const_args) if entry_key not in self.kernel_cache: - # compiling the kernel also completes the related computations + # compile the kernel also completes the related computations kernel = self.fn.run(*args, **kwargs) fn = self.fn # collect constexpr arguments for grid computation @@ -109,6 +109,7 @@ def run(self, *args, **kwargs): constexprs[p.name] = p.default #default=inspect._empty self.kernel_cache[entry_key] = (kernel, constexprs) else: + # load kernel from cache directly kernel, constexprs = self.kernel_cache[entry_key] if callable(grid): @@ -145,6 +146,15 @@ def libentry(): The runtime overhead of Triton kernels is the reason for the lower performance of small kernels, particularly evident with smaller models. Using this decorator can reduce Triton runtime overhead. + How: + The `run` function of JITFunction needs to accomplish: + - Parameter binding using inspect + - KernelArg type wrapping + - Cache key calculation + When dealing with small size, these steps can become bottlenecks in + Triton runtime. Libentry simplifies these steps to reduce runtime + overhead, thereby improving the runtime expenses of small kernels. 
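For reference, a rough sketch of the kind of per-call key that replaces the work listed above (modeled on the first revision of LibEntry in this series, which keyed on tensor dtype, pointer alignment, and integer values; the helper name is illustrative):

    def _entry_key(args, divisibility=16):
        # Cheap key: just enough to find a previously compiled kernel again,
        # with no inspect-based binding and no KernelArg wrapping.
        key = []
        for arg in args:
            if hasattr(arg, "data_ptr"):  # torch.Tensor
                key.append(arg.dtype)
                key.append(arg.data_ptr() % divisibility == 0)
            elif isinstance(arg, int):
                key.append(arg)
        return tuple(key)

Only properties that can change the compiled binary (dtype, pointer alignment, integer specialization) are folded into the key, so computing it stays cheap even for kernels with many arguments.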
+ """ def decorator(fn): From b02bce3aae76d50b60f43909e55061db27a857c5 Mon Sep 17 00:00:00 2001 From: jeejeeli Date: Tue, 23 Jul 2024 07:45:29 +0800 Subject: [PATCH 64/71] add a comment to libentry code --- vllm/triton_utils/libentry.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/vllm/triton_utils/libentry.py b/vllm/triton_utils/libentry.py index 9e3774d98a54..2654faa526db 100644 --- a/vllm/triton_utils/libentry.py +++ b/vllm/triton_utils/libentry.py @@ -154,6 +154,10 @@ def libentry(): When dealing with small size, these steps can become bottlenecks in Triton runtime. Libentry simplifies these steps to reduce runtime overhead, thereby improving the runtime expenses of small kernels. + NOTE: + When Triton is upgraded to version 3.0.0, libentry can be removed, + see: https://github.com/vllm-project/vllm/pull/5036#issuecomment-2243396245 + """ From 89e96eb7676c4556be58abb461462e7f70a2afc5 Mon Sep 17 00:00:00 2001 From: jeejeeli Date: Tue, 23 Jul 2024 14:30:43 +0800 Subject: [PATCH 65/71] test lora CI --- tests/lora/test_quant_model.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/lora/test_quant_model.py b/tests/lora/test_quant_model.py index 2c78fbae397c..2370c693e953 100644 --- a/tests/lora/test_quant_model.py +++ b/tests/lora/test_quant_model.py @@ -71,7 +71,7 @@ def test_quant_model_lora(tinyllama_lora_files, model, tp_size): max_loras=4, max_model_len=400, tensor_parallel_size=tp_size, - gpu_memory_utilization=0.4, #avoid OOM + gpu_memory_utilization=0.2, #avoid OOM quantization=model.quantization, trust_remote_code=True) @@ -164,7 +164,7 @@ def test_quant_model_tp_equality(tinyllama_lora_files, model): max_num_seqs=16, max_loras=4, tensor_parallel_size=1, - gpu_memory_utilization=0.4, #avoid OOM + gpu_memory_utilization=0.2, #avoid OOM quantization=model.quantization, trust_remote_code=True) output_tp1 = do_sample(llm_tp1, tinyllama_lora_files, lora_id=1) @@ -178,7 +178,7 @@ def test_quant_model_tp_equality(tinyllama_lora_files, model): max_num_seqs=16, max_loras=4, tensor_parallel_size=2, - gpu_memory_utilization=0.4, #avoid OOM + gpu_memory_utilization=0.2, #avoid OOM quantization=model.quantization) output_tp2 = do_sample(llm_tp2, tinyllama_lora_files, lora_id=1) From 1f4a4721569ffd7f6141e7845bbbe5876c148348 Mon Sep 17 00:00:00 2001 From: jeejeeli Date: Fri, 26 Jul 2024 00:47:26 +0800 Subject: [PATCH 66/71] fix typo --- vllm/triton_utils/libentry.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/triton_utils/libentry.py b/vllm/triton_utils/libentry.py index 2654faa526db..ae00af44a048 100644 --- a/vllm/triton_utils/libentry.py +++ b/vllm/triton_utils/libentry.py @@ -135,7 +135,7 @@ def run(self, *args, **kwargs): elif isinstance(grid, list): grid = grid + [1, 1] kernel[grid[0:3]](*k_args) - # maintaining the same return type as the JITFunction.run + # maintaining the same return type as the JITFunction.run return kernel From 377847ad288a501daa01104a3cef79f54e478d9e Mon Sep 17 00:00:00 2001 From: jeejeeli Date: Fri, 26 Jul 2024 00:49:59 +0800 Subject: [PATCH 67/71] modify test --- tests/lora/test_triton_punica.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/lora/test_triton_punica.py b/tests/lora/test_triton_punica.py index de935818c1c5..80b5ec017286 100644 --- a/tests/lora/test_triton_punica.py +++ b/tests/lora/test_triton_punica.py @@ -65,7 +65,7 @@ ] BATCHES = [1, 2, 4] + [8 * i for i in range(1, 7)] -NUM_LORA = [1, 4, 8, 16, 32, 64, 128] +NUM_LORA = [1, 4, 8, 16] DTYPES = 
[torch.float16, torch.bfloat16] MAX_RANKS = [1, 4, 8, 16, 32, 64, 128] SCALES = [0.5] From cd1fb05c6fd87e888b7eb6a99c40cd793fb384c7 Mon Sep 17 00:00:00 2001 From: jeejeeli Date: Fri, 26 Jul 2024 00:59:15 +0800 Subject: [PATCH 68/71] Trigger CI From 9ac909e9bbe1bbc7a0b00344866575766ff00942 Mon Sep 17 00:00:00 2001 From: jeejeeli Date: Fri, 26 Jul 2024 23:28:15 +0800 Subject: [PATCH 69/71] optimize bgmv_exapnd and enhance punica unit test --- ..._triton_punica.py => test_punica_sizes.py} | 283 +++++---------- tests/lora/test_punica_variation.py | 342 ++++++++++++++++++ tests/lora/utils.py | 148 ++++++++ vllm/lora/ops/bgmv_expand.py | 2 +- vllm/lora/ops/bgmv_expand_slice.py | 2 +- vllm/lora/ops/utils.py | 18 +- 6 files changed, 601 insertions(+), 194 deletions(-) rename tests/lora/{test_triton_punica.py => test_punica_sizes.py} (53%) create mode 100644 tests/lora/test_punica_variation.py diff --git a/tests/lora/test_triton_punica.py b/tests/lora/test_punica_sizes.py similarity index 53% rename from tests/lora/test_triton_punica.py rename to tests/lora/test_punica_sizes.py index 80b5ec017286..c052568dc2e3 100644 --- a/tests/lora/test_triton_punica.py +++ b/tests/lora/test_punica_sizes.py @@ -1,3 +1,9 @@ +""" +This script is mainly used to tests various hidden_sizes. We have collected the +hidden_sizes included in the LoRA models currently supported by vLLM. It tests +whether the corresponding Triton kernel can run normally when tensor parallelism +is set to [1, 2, 4, 8, 16, 32, 64]. +""" import random from unittest.mock import patch @@ -12,42 +18,67 @@ from vllm.lora.ops.sgmv_shrink import sgmv_shrink from vllm.triton_utils.libentry import LibEntry +from .utils import (generate_data, generate_data_for_expand_nslices, + ref_torch_groupgemm) + HIDDEN_SIZES = [ 128, 256, 512, + 896, 1024, 1152, + 1216, 1280, 1536, + 1664, 2048, + 2240, 2304, + 2368, + 2432, 2560, 2752, 3072, - 3424, + 3328, 3456, 3584, + 3712, 4096, + 4480, 4608, + 4736, + 4864, 5120, 5504, 5632, + 5888, 6144, + 6400, 6848, 6912, 7168, + 7424, 8192, + 8960, 9216, + 9472, 10240, 11008, + 11264, 13824, 14336, + 14784, + 14848, 15360, + 18944, 22016, + 22528, 24576, 27392, 27648, + 29568, + 29696, 32000, 32256, 32512, @@ -56,6 +87,9 @@ 36864, 43264, 49152, + 49408, + 60544, + 60672, 64000, 64256, 102400, @@ -63,11 +97,20 @@ 128000, 128256, ] +#The size of TP +divisibility = [1, 2, 4, 8, 16, 32, 64] + +all_hidden_size = [] +for div in divisibility: + for hidden_size in HIDDEN_SIZES: + all_hidden_size.append(hidden_size // div) -BATCHES = [1, 2, 4] + [8 * i for i in range(1, 7)] -NUM_LORA = [1, 4, 8, 16] +HIDDEN_SIZES = list(set(all_hidden_size)) + +BATCHES = [4] +NUM_LORA = [4] DTYPES = [torch.float16, torch.bfloat16] -MAX_RANKS = [1, 4, 8, 16, 32, 64, 128] +MAX_RANKS = [32] SCALES = [0.5] SEED = [0] CUDA_DEVICES = [f"cuda:{0}"] @@ -82,150 +125,10 @@ def assert_close(a, b): torch.testing.assert_close(a, b, rtol=rtol, atol=atol) -def _torch_groupgemm( - out_tensor, - inputs, - lora_weights, - lora_indices_tensor, - seq_len_tensor, - batches, - scaling, - op_type, -) -> torch.Tensor: - out_list = [] - current_offset = 0 - for lora_index, b_length in zip(range(batches), seq_len_tensor): - input_weight = inputs[current_offset:b_length + current_offset, :] - current_offset += b_length - lora_weight = lora_weights[lora_indices_tensor[lora_index]] - result = torch.nn.functional.linear(input_weight, lora_weight) - result *= scaling - out_list.append(result) - cat_result = torch.cat(out_list, dim=0) - if op_type == "expand": - out_tensor += 
cat_result - else: - out_tensor.copy_(cat_result) - return - - -def _generate_data(batches, hidden_size, lora_nums, max_rank, seq_length, - dtype, op_type, device): - seq_len_tensor = torch.randint(seq_length, seq_length + 1, - (batches, )).to(device) - b_seq_start_loc = torch.cumsum( - torch.tensor([0] + seq_len_tensor[:-1].tolist(), dtype=torch.long), - dim=0, - ).to(device) - total_tokens = seq_len_tensor.sum() - if op_type == "shrink": - inputs_tensor = torch.rand((total_tokens, hidden_size), - dtype=dtype).to(device) - lora_weights = torch.rand( - (lora_nums, max_rank, hidden_size), # col-major - dtype=dtype, - ).to(device) - # shrink op need atomic_add, so output is initinized by 0 - ref_out_tensor = torch.zeros((total_tokens, max_rank), - dtype=dtype, - device=inputs_tensor.device) - # NOTE shrink kernel using torch.float32 as output type - our_out_tensor = torch.zeros((total_tokens, max_rank), - dtype=torch.float32).to(device) - else: - inputs_tensor = torch.rand( - (total_tokens, max_rank), - dtype=dtype, - ).to(device) - lora_weights = torch.rand( - (lora_nums, hidden_size, max_rank), # col-major - dtype=dtype, - ).to(device) - # expand op needs to complete y+=a@lora_b, so output is - # initinized randomly - ref_out_tensor = torch.rand( - (total_tokens, hidden_size), - dtype=dtype, - ).to(device) - # Ensure the same input. - our_out_tensor = ref_out_tensor.clone() - lora_indices_tensor = torch.randint(0, - lora_nums - 1 if lora_nums > 1 else 1, - (batches, )).to(device) - indices = torch.zeros((total_tokens), dtype=torch.long).to(device) - current_offset = 0 - for b_id in range(batches): - lora_index = lora_indices_tensor[b_id] - indices[current_offset:current_offset + - seq_len_tensor[b_id]].copy_(lora_index) - current_offset += seq_len_tensor[b_id].item() - return ( - inputs_tensor, - lora_weights, - our_out_tensor, - ref_out_tensor, - b_seq_start_loc, - lora_indices_tensor, - seq_len_tensor, - indices, - ) - - -def _generate_data_expand_nslices(batches, hidden_size, lora_nums, max_rank, - seq_length, dtype, nslices, device): - try: - seq_len_tensor = torch.randint(seq_length, seq_length + 1, - (batches, )).to(device) - b_seq_start_loc = torch.cumsum( - torch.tensor([0] + seq_len_tensor[:-1].tolist(), dtype=torch.long), - dim=0, - ).to(device) - total_tokens = seq_len_tensor.sum() - inputs_tensor = torch.rand( - (total_tokens, max_rank), - dtype=dtype, - ).to(device) - lora_weights_lst = [] - for _ in range(nslices): - lora_weights_lst.append( - torch.rand( - (lora_nums, hidden_size, max_rank), # col-major - dtype=dtype, - ).to(device)) - # expand op needs to complete y+=a@lora_b, so output is - # initinized randomly - ref_out_tensor = torch.rand((total_tokens, hidden_size * nslices), - dtype=dtype).to(device) - # Ensure the same input. 
- our_out_tensor = ref_out_tensor.clone() - lora_indices_tensor = torch.randint( - 0, lora_nums - 1 if lora_nums > 1 else 1, (batches, )) - indices = torch.zeros((total_tokens), dtype=torch.long).to(device) - current_offset = 0 - for b_id in range(batches): - lora_index = lora_indices_tensor[b_id] - indices[current_offset:current_offset + - seq_len_tensor[b_id]] = lora_index.item() - current_offset += seq_len_tensor[b_id].item() - - lora_indices_tensor = lora_indices_tensor.to(device) - return ( - inputs_tensor, - lora_weights_lst, - our_out_tensor, - ref_out_tensor, - b_seq_start_loc, - lora_indices_tensor, - seq_len_tensor, - indices, - ) - except Exception as error: - raise error - - @pytest.mark.parametrize("batches", BATCHES) @pytest.mark.parametrize("num_loras", NUM_LORA) @pytest.mark.parametrize("rank", MAX_RANKS) +@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES) @pytest.mark.parametrize("scaling", SCALES) @pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("op_type", ["shrink", "expand"]) @@ -235,6 +138,7 @@ def test_punica_sgmv( batches: int, num_loras: int, rank: int, + hidden_size: int, scaling: float, dtype: torch.dtype, op_type: str, @@ -247,10 +151,6 @@ def test_punica_sgmv( if torch.cuda.is_available(): torch.cuda.manual_seed(seed) - hidden_size_index = random.randint(0, len(HIDDEN_SIZES) - 1) - hidden_size = HIDDEN_SIZES[hidden_size_index] - if hidden_size > 100000: - hidden_size = hidden_size // 4 # avoid OOM seq_length = 128 ( inputs_tensor, @@ -261,8 +161,16 @@ def test_punica_sgmv( lora_indices_tensor, seq_len_tensor, indices, - ) = _generate_data(batches, hidden_size, num_loras, rank, seq_length, - dtype, op_type, device) + ) = generate_data( + batches, + hidden_size, + num_loras, + rank, + seq_length, + dtype, + op_type, + device, + ) max_seq_length = seq_len_tensor.max() if isinstance(max_seq_length, tuple): max_seq_length = max_seq_length[0].item() @@ -292,7 +200,7 @@ def test_punica_sgmv( max_seq_length, add_inputs=True, ) - _torch_groupgemm( + ref_torch_groupgemm( ref_out_tensor, inputs_tensor, lora_weights, @@ -310,6 +218,7 @@ def test_punica_sgmv( @pytest.mark.parametrize("batches", BATCHES) @pytest.mark.parametrize("num_loras", NUM_LORA) @pytest.mark.parametrize("rank", MAX_RANKS) +@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES) @pytest.mark.parametrize("scaling", SCALES) @pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("op_type", ["shrink", "expand"]) @@ -319,6 +228,7 @@ def test_punica_bgmv( batches: int, num_loras: int, rank: int, + hidden_size: int, scaling: float, dtype: torch.dtype, op_type: str, @@ -327,16 +237,13 @@ def test_punica_bgmv( ): from vllm.lora.ops.bgmv_expand import _bgmv_expand_kernel from vllm.lora.ops.bgmv_shrink import _bgmv_shrink_kernel + random.seed(seed) torch.set_default_device(device) torch.random.manual_seed(seed) if torch.cuda.is_available(): torch.cuda.manual_seed(seed) - hidden_size_index = random.randint(0, len(HIDDEN_SIZES) - 1) - hidden_size = HIDDEN_SIZES[hidden_size_index] - if hidden_size > 100000: - hidden_size = hidden_size // 4 # avoid OOM seq_length = 1 ( inputs_tensor, @@ -347,14 +254,24 @@ def test_punica_bgmv( lora_indices_tensor, seq_len_tensor, indices, - ) = _generate_data(batches, hidden_size, num_loras, rank, seq_length, - dtype, op_type, device) + ) = generate_data( + batches, + hidden_size, + num_loras, + rank, + seq_length, + dtype, + op_type, + device, + ) if op_type == "shrink": - #The current _bgmv_shrink_kernel does not require the libentry + # The current 
_bgmv_shrink_kernel does not require the libentry # decoration. The purpose of adding this patch is to test the # correctness of libentry. - with patch("vllm.lora.ops.bgmv_shrink._bgmv_shrink_kernel", - LibEntry(_bgmv_shrink_kernel)): + with patch( + "vllm.lora.ops.bgmv_shrink._bgmv_shrink_kernel", + LibEntry(_bgmv_shrink_kernel), + ): bgmv_shrink( inputs_tensor, lora_weights, @@ -363,9 +280,11 @@ def test_punica_bgmv( scaling, ) else: - #ditto - with patch("vllm.lora.ops.bgmv_expand._bgmv_expand_kernel", - LibEntry(_bgmv_expand_kernel)): + # ditto + with patch( + "vllm.lora.ops.bgmv_expand._bgmv_expand_kernel", + LibEntry(_bgmv_expand_kernel), + ): bgmv_expand( inputs_tensor, lora_weights, @@ -373,7 +292,7 @@ def test_punica_bgmv( indices, add_inputs=True, ) - _torch_groupgemm( + ref_torch_groupgemm( ref_out_tensor, inputs_tensor, lora_weights, @@ -391,6 +310,7 @@ def test_punica_bgmv( @pytest.mark.parametrize("batches", BATCHES) @pytest.mark.parametrize("num_loras", NUM_LORA) @pytest.mark.parametrize("rank", MAX_RANKS) +@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES) @pytest.mark.parametrize("nslices", [2, 3]) @pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("op_type", ["sgmv", "bgmv"]) @@ -400,6 +320,7 @@ def test_punica_expand_nslices( batches: int, num_loras: int, rank: int, + hidden_size: int, nslices: int, dtype: torch.dtype, op_type: str, @@ -407,15 +328,12 @@ def test_punica_expand_nslices( device: str, ): from vllm.lora.ops.bgmv_expand_slice import _bgmv_expand_slice_kernel + random.seed(seed) torch.set_default_device(device) torch.random.manual_seed(seed) if torch.cuda.is_available(): torch.cuda.manual_seed(seed) - hidden_size_index = random.randint(0, len(HIDDEN_SIZES) - 1) - hidden_size = HIDDEN_SIZES[hidden_size_index] - if hidden_size > 100000: - hidden_size = hidden_size // 4 # avoid OOM seq_length = 128 if op_type == "sgmv" else 1 ( inputs_tensor, @@ -426,7 +344,7 @@ def test_punica_expand_nslices( lora_indices_tensor, seq_len_tensor, indices, - ) = _generate_data_expand_nslices( + ) = generate_data_for_expand_nslices( batches, hidden_size, num_loras, @@ -459,12 +377,13 @@ def test_punica_expand_nslices( add_inputs=True, ) else: - #The current _bgmv_expand_slice_kernel does not require the + # The current _bgmv_expand_slice_kernel does not require the # libentry decoration. The purpose of adding this patch is to test # the correctness of libentry. 
with patch( "vllm.lora.ops.bgmv_expand_slice._bgmv_expand_slice_kernel", - LibEntry(_bgmv_expand_slice_kernel)): + LibEntry(_bgmv_expand_slice_kernel), + ): bgmv_expand_slice( inputs_tensor, lora_weights, @@ -474,7 +393,7 @@ def test_punica_expand_nslices( slice_size=hidden_size, add_inputs=True, ) - _torch_groupgemm( + ref_torch_groupgemm( ref_outputs[:, slice_offset:slice_offset + hidden_size], inputs_tensor, lora_weights, @@ -487,21 +406,3 @@ def test_punica_expand_nslices( slice_offset += hidden_size assert_close(our_outputs, ref_outputs) - - -if __name__ == "__main__": - from itertools import product - lst = list( - product( - BATCHES, - NUM_LORA, - MAX_RANKS, - [1.0], - [torch.float16], - ["shrink"], - SEED, - CUDA_DEVICES, - )) - for ele in lst: - test_punica_bgmv(*ele) - print(f"{ele},pass") diff --git a/tests/lora/test_punica_variation.py b/tests/lora/test_punica_variation.py new file mode 100644 index 000000000000..7e73ea67ee5f --- /dev/null +++ b/tests/lora/test_punica_variation.py @@ -0,0 +1,342 @@ +""" +This script is mainly used to test whether trtion kernels can run normally +under different conditions, including various batches, numbers of LoRA , and +maximum ranks. +""" +import random +from unittest.mock import patch + +import pytest +import torch + +from vllm.lora.ops.bgmv_expand import bgmv_expand +from vllm.lora.ops.bgmv_expand_slice import bgmv_expand_slice +from vllm.lora.ops.bgmv_shrink import bgmv_shrink +from vllm.lora.ops.sgmv_expand import sgmv_expand +from vllm.lora.ops.sgmv_expand_slice import sgmv_expand_slice +from vllm.lora.ops.sgmv_shrink import sgmv_shrink +from vllm.triton_utils.libentry import LibEntry + +from .utils import (generate_data, generate_data_for_expand_nslices, + ref_torch_groupgemm) + +HIDDEN_SIZES = [3424, 4096, 4097] + +BATCHES = [1, 4, 16, 32] +NUM_LORA = [1, 4, 8, 16, 32, 64, 128] +DTYPES = [torch.float16, torch.bfloat16] +MAX_RANKS = [1, 4, 8, 16, 32, 64, 128] +SCALES = [0.5] +SEED = [0] +CUDA_DEVICES = [f"cuda:{0}"] + + +def assert_close(a, b): + rtol, atol = { + torch.float16: (6e-2, 6e-2), + torch.bfloat16: (6e-2, 6e-2), + torch.float32: (1e-2, 1e-2), + }[a.dtype] + torch.testing.assert_close(a, b, rtol=rtol, atol=atol) + + +@pytest.mark.parametrize("batches", BATCHES) +@pytest.mark.parametrize("num_loras", NUM_LORA) +@pytest.mark.parametrize("rank", MAX_RANKS) +@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES) +@pytest.mark.parametrize("scaling", SCALES) +@pytest.mark.parametrize("dtype", DTYPES) +@pytest.mark.parametrize("op_type", ["shrink", "expand"]) +@pytest.mark.parametrize("seed", SEED) +@pytest.mark.parametrize("device", CUDA_DEVICES) +def test_punica_sgmv( + batches: int, + num_loras: int, + rank: int, + hidden_size: int, + scaling: float, + dtype: torch.dtype, + op_type: str, + seed: int, + device: str, +): + random.seed(seed) + torch.set_default_device(device) + torch.random.manual_seed(seed) + if torch.cuda.is_available(): + torch.cuda.manual_seed(seed) + + seq_length = 128 + ( + inputs_tensor, + lora_weights, + our_out_tensor, + ref_out_tensor, + b_seq_start_loc, + lora_indices_tensor, + seq_len_tensor, + indices, + ) = generate_data( + batches, + hidden_size, + num_loras, + rank, + seq_length, + dtype, + op_type, + device, + ) + max_seq_length = seq_len_tensor.max() + if isinstance(max_seq_length, tuple): + max_seq_length = max_seq_length[0].item() + else: + max_seq_length = max_seq_length.item() + if op_type == "shrink": + sgmv_shrink( + inputs_tensor, + lora_weights, + our_out_tensor, + b_seq_start_loc, + 
seq_len_tensor, + lora_indices_tensor, + batches, + max_seq_length, + scaling, + ) + else: + sgmv_expand( + inputs_tensor, + lora_weights, + our_out_tensor, + b_seq_start_loc, + seq_len_tensor, + lora_indices_tensor, + batches, + max_seq_length, + add_inputs=True, + ) + ref_torch_groupgemm( + ref_out_tensor, + inputs_tensor, + lora_weights, + lora_indices_tensor, + seq_len_tensor, + batches, + scaling if op_type == "shrink" else 1.0, + op_type, + ) + if op_type == "shrink": + ref_out_tensor = ref_out_tensor.to(torch.float32) + assert_close(our_out_tensor, ref_out_tensor) + + +@pytest.mark.parametrize("batches", BATCHES) +@pytest.mark.parametrize("num_loras", NUM_LORA) +@pytest.mark.parametrize("rank", MAX_RANKS) +@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES) +@pytest.mark.parametrize("scaling", SCALES) +@pytest.mark.parametrize("dtype", DTYPES) +@pytest.mark.parametrize("op_type", ["shrink", "expand"]) +@pytest.mark.parametrize("seed", SEED) +@pytest.mark.parametrize("device", CUDA_DEVICES) +def test_punica_bgmv( + batches: int, + num_loras: int, + rank: int, + hidden_size: int, + scaling: float, + dtype: torch.dtype, + op_type: str, + seed: int, + device: str, +): + from vllm.lora.ops.bgmv_expand import _bgmv_expand_kernel + from vllm.lora.ops.bgmv_shrink import _bgmv_shrink_kernel + + random.seed(seed) + torch.set_default_device(device) + torch.random.manual_seed(seed) + if torch.cuda.is_available(): + torch.cuda.manual_seed(seed) + + seq_length = 1 + ( + inputs_tensor, + lora_weights, + our_out_tensor, + ref_out_tensor, + b_seq_start_loc, + lora_indices_tensor, + seq_len_tensor, + indices, + ) = generate_data( + batches, + hidden_size, + num_loras, + rank, + seq_length, + dtype, + op_type, + device, + ) + if op_type == "shrink": + # The current _bgmv_shrink_kernel does not require the libentry + # decoration. The purpose of adding this patch is to test the + # correctness of libentry. 
+ with patch( + "vllm.lora.ops.bgmv_shrink._bgmv_shrink_kernel", + LibEntry(_bgmv_shrink_kernel), + ): + bgmv_shrink( + inputs_tensor, + lora_weights, + our_out_tensor, + indices, + scaling, + ) + else: + # ditto + with patch( + "vllm.lora.ops.bgmv_expand._bgmv_expand_kernel", + LibEntry(_bgmv_expand_kernel), + ): + bgmv_expand( + inputs_tensor, + lora_weights, + our_out_tensor, + indices, + add_inputs=True, + ) + ref_torch_groupgemm( + ref_out_tensor, + inputs_tensor, + lora_weights, + lora_indices_tensor, + seq_len_tensor, + batches, + scaling if op_type == "shrink" else 1.0, + op_type, + ) + if op_type == "shrink": + ref_out_tensor = ref_out_tensor.to(torch.float32) + assert_close(our_out_tensor, ref_out_tensor) + + +@pytest.mark.parametrize("batches", BATCHES) +@pytest.mark.parametrize("num_loras", NUM_LORA) +@pytest.mark.parametrize("rank", MAX_RANKS) +@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES) +@pytest.mark.parametrize("nslices", [2, 3]) +@pytest.mark.parametrize("dtype", DTYPES) +@pytest.mark.parametrize("op_type", ["sgmv", "bgmv"]) +@pytest.mark.parametrize("seed", SEED) +@pytest.mark.parametrize("device", CUDA_DEVICES) +def test_punica_expand_nslices( + batches: int, + num_loras: int, + rank: int, + hidden_size: int, + nslices: int, + dtype: torch.dtype, + op_type: str, + seed: int, + device: str, +): + from vllm.lora.ops.bgmv_expand_slice import _bgmv_expand_slice_kernel + + random.seed(seed) + torch.set_default_device(device) + torch.random.manual_seed(seed) + if torch.cuda.is_available(): + torch.cuda.manual_seed(seed) + seq_length = 128 if op_type == "sgmv" else 1 + ( + inputs_tensor, + lora_weights_lst, + our_outputs, + ref_outputs, + b_seq_start_loc, + lora_indices_tensor, + seq_len_tensor, + indices, + ) = generate_data_for_expand_nslices( + batches, + hidden_size, + num_loras, + rank, + seq_length, + dtype, + nslices, + device, + ) + max_seq_length = seq_len_tensor.max() + if isinstance(max_seq_length, tuple): + max_seq_length = max_seq_length[0].item() + else: + max_seq_length = max_seq_length.item() + slice_offset = 0 + for index in range(nslices): + lora_weights = lora_weights_lst[index] + if op_type == "sgmv": + sgmv_expand_slice( + inputs_tensor, + lora_weights, + our_outputs, + b_seq_start_loc, + seq_len_tensor, + lora_indices_tensor, + batches, + max_seq_length, + slice_offset, + hidden_size, + add_inputs=True, + ) + else: + # The current _bgmv_expand_slice_kernel does not require the + # libentry decoration. The purpose of adding this patch is to test + # the correctness of libentry. 
+ with patch(
+ "vllm.lora.ops.bgmv_expand_slice._bgmv_expand_slice_kernel",
+ LibEntry(_bgmv_expand_slice_kernel),
+ ):
+ bgmv_expand_slice(
+ inputs_tensor,
+ lora_weights,
+ our_outputs,
+ indices,
+ slice_offset,
+ slice_size=hidden_size,
+ add_inputs=True,
+ )
+ ref_torch_groupgemm(
+ ref_outputs[:, slice_offset:slice_offset + hidden_size],
+ inputs_tensor,
+ lora_weights,
+ lora_indices_tensor,
+ seq_len_tensor,
+ batches,
+ 1.0,
+ op_type="expand",
+ )
+
+ slice_offset += hidden_size
+ assert_close(our_outputs, ref_outputs)
+
+
+if __name__ == "__main__":
+ from itertools import product
+
+ lst = list(
+ product(
+ BATCHES,
+ NUM_LORA,
+ MAX_RANKS,
+ HIDDEN_SIZES,
+ [1.0],
+ [torch.float16],
+ ["expand"],
+ SEED,
+ CUDA_DEVICES,
+ ))
+ for ele in lst:
+ test_punica_bgmv(*ele)
+ print(f"{ele},pass")
diff --git a/tests/lora/utils.py b/tests/lora/utils.py
index b73cf5bf5532..00f8e26d1041 100644
--- a/tests/lora/utils.py
+++ b/tests/lora/utils.py
@@ -86,3 +86,151 @@ def init_packed_lora(
 packed_lora = PackedLoRALayerWeights.pack(base_loras)
 self.set_module_lora(module_name, packed_lora)
 return packed_lora
+
+
+def assert_close(a, b):
+ rtol, atol = {
+ torch.float16: (6e-2, 6e-2),
+ torch.bfloat16: (6e-2, 6e-2),
+ torch.float32: (1e-2, 1e-2),
+ }[a.dtype]
+ torch.testing.assert_close(a, b, rtol=rtol, atol=atol)
+
+
+def ref_torch_groupgemm(
+ out_tensor,
+ inputs,
+ lora_weights,
+ lora_indices_tensor,
+ seq_len_tensor,
+ batches,
+ scaling,
+ op_type,
+) -> torch.Tensor:
+ out_list = []
+ current_offset = 0
+ for lora_index, b_length in zip(range(batches), seq_len_tensor):
+ input_weight = inputs[current_offset:b_length + current_offset, :]
+ current_offset += b_length
+ lora_weight = lora_weights[lora_indices_tensor[lora_index]]
+ result = torch.nn.functional.linear(input_weight, lora_weight)
+ result *= scaling
+ out_list.append(result)
+ cat_result = torch.cat(out_list, dim=0)
+ if op_type == "expand":
+ out_tensor += cat_result
+ else:
+ out_tensor.copy_(cat_result)
+ return
+
+
+def generate_data(batches, hidden_size, lora_nums, max_rank, seq_length, dtype,
+ op_type, device):
+ seq_len_tensor = torch.randint(seq_length, seq_length + 1,
+ (batches, )).to(device)
+ b_seq_start_loc = torch.cumsum(
+ torch.tensor([0] + seq_len_tensor[:-1].tolist(), dtype=torch.long),
+ dim=0,
+ ).to(device)
+ total_tokens = seq_len_tensor.sum()
+ if op_type == "shrink":
+ inputs_tensor = torch.rand((total_tokens, hidden_size),
+ dtype=dtype).to(device)
+ lora_weights = torch.rand(
+ (lora_nums, max_rank, hidden_size), # col-major
+ dtype=dtype,
+ ).to(device)
+ # shrink op needs atomic_add, so output is initialized to 0
+ ref_out_tensor = torch.zeros((total_tokens, max_rank),
+ dtype=dtype,
+ device=inputs_tensor.device)
+ # NOTE shrink kernel uses torch.float32 as output type
+ our_out_tensor = torch.zeros((total_tokens, max_rank),
+ dtype=torch.float32).to(device)
+ else:
+ inputs_tensor = torch.rand(
+ (total_tokens, max_rank),
+ dtype=dtype,
+ ).to(device)
+ lora_weights = torch.rand(
+ (lora_nums, hidden_size, max_rank), # col-major
+ dtype=dtype,
+ ).to(device)
+ # expand op needs to complete y+=a@lora_b, so output is
+ # initialized randomly
+ ref_out_tensor = torch.rand(
+ (total_tokens, hidden_size),
+ dtype=dtype,
+ ).to(device)
+ # Ensure the same input.
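+ # (expand accumulates into the existing output, so the kernel under test
+ # and the torch reference must start from identical buffers)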
+ our_out_tensor = ref_out_tensor.clone()
+ lora_indices_tensor = torch.randint(0,
+ lora_nums - 1 if lora_nums > 1 else 1,
+ (batches, )).to(device)
+ indices = torch.zeros((total_tokens), dtype=torch.long).to(device)
+ current_offset = 0
+ for b_id in range(batches):
+ lora_index = lora_indices_tensor[b_id]
+ indices[current_offset:current_offset +
+ seq_len_tensor[b_id]].copy_(lora_index)
+ current_offset += seq_len_tensor[b_id].item()
+ return (
+ inputs_tensor,
+ lora_weights,
+ our_out_tensor,
+ ref_out_tensor,
+ b_seq_start_loc,
+ lora_indices_tensor,
+ seq_len_tensor,
+ indices,
+ )
+
+
+def generate_data_for_expand_nslices(batches, hidden_size, lora_nums, max_rank,
+ seq_length, dtype, nslices, device):
+ seq_len_tensor = torch.randint(seq_length, seq_length + 1,
+ (batches, )).to(device)
+ b_seq_start_loc = torch.cumsum(
+ torch.tensor([0] + seq_len_tensor[:-1].tolist(), dtype=torch.long),
+ dim=0,
+ ).to(device)
+ total_tokens = seq_len_tensor.sum()
+ inputs_tensor = torch.rand(
+ (total_tokens, max_rank),
+ dtype=dtype,
+ ).to(device)
+ lora_weights_lst = []
+ for _ in range(nslices):
+ lora_weights_lst.append(
+ torch.rand(
+ (lora_nums, hidden_size, max_rank), # col-major
+ dtype=dtype,
+ ).to(device))
+ # expand op needs to complete y+=a@lora_b, so output is
+ # initialized randomly
+ ref_out_tensor = torch.rand((total_tokens, hidden_size * nslices),
+ dtype=dtype).to(device)
+ # Ensure the same input.
+ our_out_tensor = ref_out_tensor.clone()
+ lora_indices_tensor = torch.randint(0,
+ lora_nums - 1 if lora_nums > 1 else 1,
+ (batches, ))
+ indices = torch.zeros((total_tokens), dtype=torch.long).to(device)
+ current_offset = 0
+ for b_id in range(batches):
+ lora_index = lora_indices_tensor[b_id]
+ indices[current_offset:current_offset +
+ seq_len_tensor[b_id]] = lora_index.item()
+ current_offset += seq_len_tensor[b_id].item()
+
+ lora_indices_tensor = lora_indices_tensor.to(device)
+ return (
+ inputs_tensor,
+ lora_weights_lst,
+ our_out_tensor,
+ ref_out_tensor,
+ b_seq_start_loc,
+ lora_indices_tensor,
+ seq_len_tensor,
+ indices,
+ )
diff --git a/vllm/lora/ops/bgmv_expand.py b/vllm/lora/ops/bgmv_expand.py
index 2d09c7cfe6c8..dcaf2e3d462c 100644
--- a/vllm/lora/ops/bgmv_expand.py
+++ b/vllm/lora/ops/bgmv_expand.py
@@ -56,7 +56,7 @@ def _bgmv_expand_kernel(
 mask=offset_k < K,
 other=0,
 ) # [BLOCK_K]
-
+ # N must be divisible by SPLIT_N
 split_n_length = tl.cdiv(N, SPLIT_N)
 if CAST_TYPE:
 tiled_a = tiled_a.to(lora_ptr.dtype.element_ty)
diff --git a/vllm/lora/ops/bgmv_expand_slice.py b/vllm/lora/ops/bgmv_expand_slice.py
index 31b2cd545d3d..fa6571074f3a 100644
--- a/vllm/lora/ops/bgmv_expand_slice.py
+++ b/vllm/lora/ops/bgmv_expand_slice.py
@@ -57,7 +57,7 @@ def _bgmv_expand_slice_kernel(
 mask=offset_k < K,
 other=0,
 ) # [BLOCK_K]
-
+ # N must be divisible by SPLIT_N
 split_n_length = tl.cdiv(N, SPLIT_N)
 if CAST_TYPE:
 tiled_a = tiled_a.to(lora_ptr.dtype.element_ty)
diff --git a/vllm/lora/ops/utils.py b/vllm/lora/ops/utils.py
index 6a637288f71e..7c3e27313ad9 100644
--- a/vllm/lora/ops/utils.py
+++ b/vllm/lora/ops/utils.py
@@ -8,9 +8,25 @@ def _get_op_configs(op_type: str, batch: int, hidden_size: int):
 return None
 
 
+def _check_divisibility(hidden_size: int):
+ # The bgmv_expand kernel requires that the hidden_size be divisible by
+ # the number below.
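+ # The largest matching divisor is used as SPLIT_N in _get_default_config
+ # below, so every split covers a whole number of columns; odd hidden
+ # sizes fall back to 1.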
+ divisibility = [2, 4, 8, 16, 32, 64] + divisibility.sort(reverse=True) + for div in divisibility: + if hidden_size % div == 0: + return div + # hidden_size is an odd number + return 1 + + def _get_default_config(op_type: str, batch: int, hidden_size: int): if op_type == "expand": - return {"BLOCK_N": 256, "SPLIT_N": 64, "num_warps": 8} + return { + "BLOCK_N": 256, + "SPLIT_N": _check_divisibility(hidden_size), + "num_warps": 8 + } else: return {"BLOCK_K": 256, "SPLIT_K": 64, "num_warps": 8} From 9a4f1472cb1f70d0c2527c024e96e98f3a3ccf52 Mon Sep 17 00:00:00 2001 From: jeejeeli Date: Sun, 28 Jul 2024 21:55:44 +0800 Subject: [PATCH 70/71] fix docstring bug --- vllm/lora/fully_sharded_layers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/lora/fully_sharded_layers.py b/vllm/lora/fully_sharded_layers.py index f751434bb7b4..a7887a048746 100644 --- a/vllm/lora/fully_sharded_layers.py +++ b/vllm/lora/fully_sharded_layers.py @@ -95,7 +95,7 @@ def can_replace_layer( def _mcp_apply(x, bias, layer: QKVParallelLinearWithLora): """ MergedColumnParallelLinearWithShardedLoRA and - QKVParallelLinearWithShardedLora share the same + MergedQKVParallelLinearWithShardedLora share the same LoRa weight application method. The main difference is the step by shard_size for lora_b which can From 6620ffb7bd73449f464e88fa66b62a6bee0e81d8 Mon Sep 17 00:00:00 2001 From: jeejeeli Date: Sun, 28 Jul 2024 22:38:02 +0800 Subject: [PATCH 71/71] modify max batches --- vllm/lora/models.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/vllm/lora/models.py b/vllm/lora/models.py index 9a9b4766cf41..017a1002bb9a 100644 --- a/vllm/lora/models.py +++ b/vllm/lora/models.py @@ -31,9 +31,6 @@ _GLOBAL_LORA_ID = 0 -# NOTE This value comes from vllm/worker/model_runner.py -_MAX_BATCH_SIZE = 256 - @dataclass class LongContextLoRAContext: @@ -318,7 +315,7 @@ def __init__( self.vocab_size = vocab_size self.long_lora_context: Optional[LongContextLoRAContext] = None self.punica_wrapper = PunicaWrapper(max_num_batched_tokens, - max_batches=_MAX_BATCH_SIZE, + max_batches=self.max_num_seqs, device="cuda") # Scaling factor -> offset to the sin_cos_cache to it. # Used for long context lora.
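For reference, a minimal standalone sketch of the SPLIT_N selection introduced in
vllm/lora/ops/utils.py above; check_divisibility here is an illustrative stand-in
that mirrors _check_divisibility from the diff:

    def check_divisibility(hidden_size: int) -> int:
        # Largest power of two (up to 64) that divides hidden_size; odd
        # sizes fall back to 1, so SPLIT_N always divides N exactly.
        for div in (64, 32, 16, 8, 4, 2):
            if hidden_size % div == 0:
                return div
        return 1

    assert check_divisibility(4096) == 64
    assert check_divisibility(1000) == 8
    assert check_divisibility(3333) == 1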